From 69d63ac76248a45e696d0f7e538d3da0227ebcab Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 10:25:54 +0300
Subject: [PATCH 01/52] add sq8

---
 src/VecSim/spaces/IP/IP.cpp                   |  43 ++
 src/VecSim/spaces/IP/IP.h                     |   6 +
 src/VecSim/spaces/IP/IP_AVX2_SQ8.h            | 104 ++++
 .../spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h     | 176 ++++++
 src/VecSim/spaces/IP/IP_AVX_SQ8.h             | 116 ++++
 src/VecSim/spaces/IP/IP_SSE_SQ8.h             | 134 +++++
 src/VecSim/spaces/IP/IP_SVE_SQ8.h             | 149 +++++
 src/VecSim/spaces/IP_space.cpp                | 116 ++++
 src/VecSim/spaces/IP_space.h                  |   7 +-
 src/VecSim/spaces/L2/L2.cpp                   |  19 +
 src/VecSim/spaces/L2/L2.h                     |   2 +
 src/VecSim/spaces/L2/L2_AVX_SQ8.h             |  55 ++
 src/VecSim/spaces/L2_space.cpp                |  58 ++
 src/VecSim/spaces/computer/preprocessors.h    | 131 +++++
 src/VecSim/spaces/functions/AVX.cpp           |   7 +
 src/VecSim/spaces/functions/AVX.h             |   1 +
 src/VecSim/spaces/functions/AVX2.cpp          |   7 +
 src/VecSim/spaces/functions/AVX2.h            |   1 +
 .../spaces/functions/AVX512F_BW_VL_VNNI.cpp   |  14 +
 .../spaces/functions/AVX512F_BW_VL_VNNI.h     |   6 +
 src/VecSim/spaces/functions/SSE.cpp           |  13 +
 src/VecSim/spaces/functions/SSE.h             |   2 +
 src/VecSim/spaces/functions/SVE.cpp           |  13 +
 src/VecSim/spaces/functions/SVE.h             |   3 +
 src/VecSim/spaces/functions/SVE2.cpp          |  13 +
 src/VecSim/spaces/functions/SVE2.h            |   3 +
 tests/benchmark/CMakeLists.txt                |   2 +-
 tests/benchmark/benchmarks.sh                 |   5 +
 .../spaces_benchmarks/bm_spaces_sq8.cpp       |  81 +++
 tests/unit/test_spaces.cpp                    | 534 +++++++++++++++++-
 tests/utils/tests_utils.h                     |  51 ++
 31 files changed, 1869 insertions(+), 3 deletions(-)
 create mode 100644 src/VecSim/spaces/IP/IP_AVX2_SQ8.h
 create mode 100644 src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h
 create mode 100644 src/VecSim/spaces/IP/IP_AVX_SQ8.h
 create mode 100644 src/VecSim/spaces/IP/IP_SSE_SQ8.h
 create mode 100644 src/VecSim/spaces/IP/IP_SVE_SQ8.h
 create mode 100644 src/VecSim/spaces/L2/L2_AVX_SQ8.h
 create mode 100644 tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp

diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp
index 638397f0f..a1e5cb8e7 100644
--- a/src/VecSim/spaces/IP/IP.cpp
+++ b/src/VecSim/spaces/IP/IP.cpp
@@ -10,10 +10,53 @@
 #include "VecSim/types/bfloat16.h"
 #include "VecSim/types/float16.h"
 #include <cstring>
+#include <iostream>
 
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
+
+// Scalar reference kernel: dot product of an fp32 vector with a dequantized SQ8 vector.
+// Element i of pVect2v is dequantized as (pVect2v[i] * delta + min_val) * inv_norm.
+// Pass inv_norm = 1.0f for the plain inner product. Returns the raw dot product (not 1 - ip).
+// This is a distance kernel, so it must not write to stdout (the debug prints were removed).
+float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension, float min_val,
+                         float delta, float inv_norm) {
+    float res = 0;
+    for (size_t i = 0; i < dimension; i++) {
+        const float dequantized_V2 = (pVect2v[i] * delta + min_val) * inv_norm;
+        res += pVect1v[i] * dequantized_V2;
+    }
+    return res;
+}
+
+// Inner-product distance (1 - <v1, v2>) between an fp32 query and an SQ8-quantized vector.
+// pVect2v layout: [uint8_t values * dimension][min_val (float)][delta (float)][inv_norm (float)].
+float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *pVect1 = static_cast<const float *>(pVect1v);
+    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    // The trailing floats start at byte offset `dimension`, which is not necessarily 4-byte
+    // aligned, so copy them out with memcpy instead of dereferencing a misaligned float pointer.
+    float params[2]; // {min_val, delta}
+    std::memcpy(params, pVect2 + dimension, sizeof(params));
+    // inv_norm is not applied here (1.0f is passed); only SQ8_Cosine rescales by it.
+    return 1.0f - FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, params[0], params[1], 1.0f);
+}
+
+// Cosine distance (1 - <v1, v2> * inv_norm) between an fp32 query and an SQ8-quantized vector.
+// pVect2v layout: [uint8_t values * dimension][min_val (float)][delta (float)][inv_norm (float)].
+float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *pVect1 = static_cast<const float *>(pVect1v);
+    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // The trailing floats start at byte offset `dimension`, which is not necessarily 4-byte
+    // aligned, so copy them out with memcpy instead of dereferencing a misaligned float pointer.
+    float params[3]; // {min_val, delta, inv_norm}
+    std::memcpy(params, pVect2 + dimension, sizeof(params));
+    // inv_norm is folded into every dequantized element by the scalar kernel. No debug output.
+    return 1.0f - FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, params[0], params[1], params[2]);
+}
+
 float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension) {
     auto *vec1 = (float *)pVect1;
     auto *vec2 = (float *)pVect2;
diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h
index a0c5f2838..7dfad24ce 100644
--- a/src/VecSim/spaces/IP/IP.h
+++ b/src/VecSim/spaces/IP/IP.h
@@ -10,6 +10,12 @@
 
 #include <cstdlib>
 
+/*
+    pVect1v is an fp32 vector; pVect2v is an SQ8 blob: dim uint8 values, then min_val, delta, inv_norm (fp32)
+*/
+float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension);
+float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);
+
 float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
 
 double FP64_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
new file mode 100644
index 000000000..6d0dd4af7
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+
+// One AVX2 step: consumes 8 floats from pVect1 and 8 quantized bytes from pVect2,
+// dequantizes the bytes as (byte * delta + min_val) and accumulates the 8 products
+// into sum256. Both pointers are advanced by 8 elements.
+static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 &sum256,
+                                      const __m256 &min_val_vec, const __m256 &delta_vec) {
+    // 8 fp32 values from the first vector (unaligned load).
+    const __m256 query = _mm256_loadu_ps(pVect1);
+
+    // 8 quantized bytes land in the low 64 bits of an XMM register.
+    const __m128i packed_bytes = _mm_loadl_epi64((__m128i*)pVect2);
+
+    // uint8 -> int32 (zero-extension, AVX2) -> fp32.
+    const __m256 as_float = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(packed_bytes));
+
+    // Dequantize with a single FMA: as_float * delta + min_val.
+    const __m256 dequantized = _mm256_fmadd_ps(as_float, delta_vec, min_val_vec);
+
+    // sum256 += query * dequantized (FMA).
+    sum256 = _mm256_fmadd_ps(query, dequantized, sum256);
+
+    pVect1 += 8;
+    pVect2 += 8;
+}
+
+// AVX2 + FMA kernel for the SQ8 inner-product distance: returns 1 - <pVect1v, dequant(pVect2v)>.
+// pVect1v holds `dimension` fp32 values; pVect2v holds `dimension` uint8 values followed by
+// the fp32 parameters min and delta, and element i dequantizes to q[i] * delta + min.
+// Bytes stored after delta, if any, are not read by this kernel.
+// `residual` is expected to be dimension % 16; it is consumed before the main loop.
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float *pVect1 = (float *)pVect1v;
+    uint8_t *quantized = (uint8_t *)pVect2v;
+
+    // The parameters start at byte offset `dimension`, which need not be 4-byte aligned, so
+    // copy them out instead of dereferencing a possibly misaligned float pointer.
+    float params[2]; // {min, delta}
+    memcpy(params, quantized + dimension, sizeof(params));
+
+    // Create broadcast vectors for SIMD operations
+    __m256 min_val_vec = _mm256_set1_ps(params[0]);
+    __m256 delta_vec = _mm256_set1_ps(params[1]);
+
+    const float *pEnd1 = pVect1 + dimension;
+
+    __m256 sum256 = _mm256_setzero_ps();
+
+    // Deal with 1-7 leftover elements first, if needed
+    if constexpr (residual % 8) {
+        constexpr unsigned char tail = residual % 8;
+        // AVX2 doesn't have native mask loading, so we use the helper function
+        __mmask8 constexpr mask = (1 << tail) - 1;
+
+        // Load masked float elements
+        // NOTE(review): lanes >= tail are presumably zeroed by the helper -- confirm in AVX_utils.h
+        __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
+        pVect1 += tail;
+
+        // Copy exactly `tail` bytes into a zeroed 64-bit scalar: the unused upper bytes stay 0,
+        // so one code path serves every tail length and nothing past the tail is read.
+        uint64_t temp = 0;
+        memcpy(&temp, quantized, tail);
+        __m128i v2_128 = _mm_cvtsi64_si128(temp);
+        quantized += tail;
+
+        // Zero-extend uint8 to int32 (AVX2 instruction)
+        __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+        // Convert int32 to float
+        __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+        // Dequantize: (val * delta) + min (using FMA)
+        __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+        // sum256 is still all-zero here, so a plain multiply initializes the accumulator
+        sum256 = _mm256_mul_ps(v1, v2_dequant);
+    }
+
+    // If the remainder is >= 8, have another step of 8 elements
+    if constexpr (residual >= 8) {
+        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
+    }
+
+    // We dealt with the residual part. We are left with some multiple of 16 elements.
+    // Each iteration consumes 16 of them as two 256-bit steps of 8 floats.
+    while (pVect1 < pEnd1) {
+        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
+    }
+
+    // Horizontal sum of the 8 lanes, then convert the similarity into a distance
+    return 1.0f - my_mm256_reduce_add_ps(sum256);
+}
diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h
new file mode 100644
index 000000000..6c001efcf
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include <immintrin.h>
+#include <iostream>
+
+// One AVX-512 step over 16 elements: dequantizes 16 bytes from pVec2 as
+// (byte * delta + min_val), multiplies them with 16 floats from pVec1 and accumulates
+// the products into sum. Both pointers are advanced by 16 elements.
+// The order of the two loads is irrelevant; only the arithmetic below is ordered.
+static inline void
+SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum,
+                     const __m512 &min_val_vec, const __m512 &delta_vec) {
+    // 16 quantized bytes (unaligned load) -> 16 zero-extended int32 lanes -> 16 fp32 lanes.
+    const __m128i packed_bytes = _mm_loadu_si128((__m128i*)pVec2);
+    const __m512i widened = _mm512_cvtepu8_epi32(packed_bytes);
+    const __m512 as_float = _mm512_cvtepi32_ps(widened);
+    pVec2 += 16;
+
+    // Dequantize: as_float * delta + min_val.
+    const __m512 dequantized = _mm512_fmadd_ps(as_float, delta_vec, min_val_vec);
+
+    // 16 fp32 values from the first vector (unaligned load).
+    const __m512 query = _mm512_loadu_ps(pVec1);
+    pVec1 += 16;
+
+    // sum += query * dequantized.
+    sum = _mm512_fmadd_ps(query, dequantized, sum);
+}
+
+// Common implementation for both inner product and cosine similarity.
+// Returns the raw dot product <pVec1v, dequant(pVec2v)> (NOT 1 - ip). pVec1v holds `dimension`
+// fp32 values; pVec2v holds `dimension` uint8 values followed by the fp32 parameters min_val
+// and delta, and element i dequantizes to q[i] * delta + min_val.
+template <unsigned char residual> // 0..63
+float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) {
+    const float *pVec1 = static_cast<const float *>(pVec1v);
+    const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
+    const uint8_t *pEnd2 = pVec2 + dimension;
+
+    // Get dequantization parameters from the end of pVec2
+    const float min_val = *reinterpret_cast<const float *>(pVec2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVec2 + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    __m512 min_val_vec = _mm512_set1_ps(min_val);
+    __m512 delta_vec = _mm512_set1_ps(delta);
+    // Initialize sum accumulator
+    __m512 sum = _mm512_setzero_ps();
+
+    // Deal with remainder first
+    if constexpr (residual) {
+        if constexpr (residual < 16) {
+            // Handle less than 16 elements
+            __mmask16 mask = (1U << residual) - 1;
+
+            // Load masked float elements
+            __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1);
+
+            // Load masked uint8 elements
+            __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i*>(pVec2));
+            __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
+            __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
+
+            // Dequantize
+            __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+            // sum += v1 * dequantized on the masked lanes (mask3 form: other lanes keep sum)
+            sum = _mm512_mask3_fmadd_ps(v1, dequantized, sum, mask);
+        }
+        else if constexpr (residual == 16) {
+            // Handle exactly 16 elements
+            SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
+        }
+        else if constexpr (residual < 32) {
+            // Handle 17-31 elements: process 16 and then remainder
+            SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
+
+            // Process remaining elements (residual - 16)
+            constexpr unsigned char remaining = residual - 16;
+            __mmask16 mask = (1U << remaining) - 1;
+
+            // Load masked float elements
+            __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1);
+
+            // Load masked uint8 elements
+            __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i*>(pVec2));
+            __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
+            __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
+
+            // Dequantize
+            __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+            // sum += v1 * dequantized on the masked lanes (mask3 form: other lanes keep sum)
+            sum = _mm512_mask3_fmadd_ps(v1, dequantized, sum, mask);
+        }
+        else if constexpr (residual == 32) {
+            // Handle exactly 32 elements: process two chunks of 16
+            SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
+            SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
+        }
+        else {
+            // Handle more than 32 elements: process chunks of 16 until less than 16 remain
+            constexpr size_t full_chunks = residual / 16;
+            for (size_t i = 0; i < full_chunks; i++) {
+                SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
+            }
+
+            // Process remaining elements (residual % 16)
+            constexpr unsigned char remaining = residual % 16;
+            if constexpr (remaining > 0) {
+                __mmask16 mask = (1U << remaining) - 1;
+
+                // Load masked float elements
+                __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1);
+
+                // Load masked uint8 elements
+                __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i*>(pVec2));
+                __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
+                __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
+
+                // Dequantize
+                __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+                // sum += v1 * dequantized on the masked lanes (mask3 form: other lanes keep sum)
+                sum = _mm512_mask3_fmadd_ps(v1, dequantized, sum, mask);
+            }
+        }
+        // The full steps above already advanced both pointers; only skip the masked tail.
+        pVec1 += residual % 16;
+        pVec2 += residual % 16;
+    }
+
+    // Process remaining full chunks of 16 elements
+    while (pVec2 < pEnd2) {
+        SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
+    }
+
+    // Horizontal sum of the 16 lanes. This is the raw dot product; the callers turn it
+    // into a distance (1 - ip, optionally scaled by inv_norm).
+    return _mm512_reduce_add_ps(sum);
+}
+
+// SQ8 inner-product distance: 1 - <pVec1v, dequant(pVec2v)>. The raw dot product comes from
+// SQ8_InnerProductImp; this wrapper only converts the similarity into a distance.
+// Distance kernels must not write to stdout, so the debug print was removed.
+template <unsigned char residual> // 0..63
+float SQ8_InnerProductSIMD64_AVX512_BW_VL_VNNI(const void *pVec1v,
+                                              const void *pVec2v,
+                                              size_t dimension) {
+    const float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);
+
+    return 1.0f - ip;
+}
+
+// SQ8 cosine distance: 1 - <pVec1v, dequant(pVec2v)> * inv_norm, where inv_norm is the third
+// fp32 value stored after the `dimension` quantized bytes (following min_val and delta).
+template <unsigned char residual> // 0..63
+float SQ8_CosineSIMD64_AVX512_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
+                                         size_t dimension) {
+    // Raw (un-normalized) dot product from the shared implementation. No debug output here.
+    const float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);
+
+    // Get the inverse norm factor stored after min_val and delta
+    const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
+    const float inv_norm = *reinterpret_cast<const float *>(pVec2 + dimension + 2 * sizeof(float));
+    return 1.0f - ip * inv_norm;
+}
+
diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
new file mode 100644
index 000000000..38c836652
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+
+// One AVX step: multiplies 8 floats from pVect1 with 8 dequantized bytes from pVect2
+// (byte * delta + min_val) and adds the products to sum256. Both pointers advance by 8.
+// Only AVX/SSE4.1 instructions are used: the 256-bit integer widening
+// _mm256_cvtepu8_epi32 is AVX2 and must not appear in the AVX-only kernel.
+static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 &sum256,
+                                      const __m256 &min_val_vec, const __m256 &delta_vec) {
+    __m256 v1 = _mm256_loadu_ps(pVect1);
+    pVect1 += 8;
+
+    // Load 8 uint8 elements into the low 64 bits of an XMM register
+    __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2);
+    pVect2 += 8;
+
+    // Zero-extend each group of 4 bytes to int32 with SSE4.1, then join the halves (AVX)
+    __m128i lo = _mm_cvtepu8_epi32(v2_128);
+    __m128i hi = _mm_cvtepu8_epi32(_mm_srli_si128(v2_128, 4));
+    __m256 v2_f = _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+
+    // Dequantize as (val * delta) + min_val, then accumulate (FMA is not part of plain AVX)
+    __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2_dequant));
+}
+
+// AVX kernel for the SQ8 inner-product distance: returns 1 - <pVect1v, dequant(pVect2v)>.
+// pVect1v holds `dimension` fp32 values; pVect2v holds `dimension` uint8 values followed by
+// the fp32 parameters min and delta, and element i dequantizes to q[i] * delta + min.
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float *pVect1 = (float *)pVect1v;
+    uint8_t *quantized = (uint8_t *)pVect2v;
+
+    // Get dequantization parameters from the end of quantized vector
+    float min = *(float *)(quantized + dimension);
+    float delta = *(float *)(quantized + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    __m256 min_val_vec = _mm256_set1_ps(min);
+    __m256 delta_vec = _mm256_set1_ps(delta);
+
+    const float *pEnd1 = pVect1 + dimension;
+    __m256 sum256 = _mm256_setzero_ps();
+
+    // Deal with 1-7 floats with mask loading, if needed
+    if constexpr (residual % 8) {
+        __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
+
+        // Load masked float elements
+        __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
+        pVect1 += residual % 8;
+
+        // Load masked uint8 elements
+        __m128i v2_128;
+        if constexpr (residual % 8 <= 4) {
+            // Assemble 4 or fewer bytes with single-byte loads and shifts
+            uint32_t temp = 0;
+            // Direct byte-by-byte loading to avoid memcpy (cases fall through on purpose)
+            switch (residual % 8) {
+                case 4: temp |= (uint32_t)quantized[3] << 24; [[fallthrough]];
+                case 3: temp |= (uint32_t)quantized[2] << 16; [[fallthrough]];
+                case 2: temp |= (uint32_t)quantized[1] << 8; [[fallthrough]];
+                case 1: temp |= quantized[0];
+            }
+            v2_128 = _mm_cvtsi32_si128(temp);
+        } else {
+            // Assemble 5-7 bytes with single-byte loads and shifts
+            uint64_t temp = 0;
+            // Direct byte-by-byte loading to avoid memcpy (cases fall through on purpose)
+            switch (residual % 8) {
+                case 7: temp |= (uint64_t)quantized[6] << 48; [[fallthrough]];
+                case 6: temp |= (uint64_t)quantized[5] << 40; [[fallthrough]];
+                case 5: temp |= (uint64_t)quantized[4] << 32; [[fallthrough]];
+                case 4: temp |= (uint64_t)quantized[3] << 24; [[fallthrough]];
+                case 3: temp |= (uint64_t)quantized[2] << 16; [[fallthrough]];
+                case 2: temp |= (uint64_t)quantized[1] << 8; [[fallthrough]];
+                case 1: temp |= quantized[0];
+            }
+            v2_128 = _mm_cvtsi64_si128(temp);
+        }
+        quantized += residual % 8;
+
+        // Zero-extend per 4-byte half (SSE4.1) and join: _mm256_cvtepu8_epi32 would need AVX2
+        __m128i lo = _mm_cvtepu8_epi32(v2_128);
+        __m128i hi = _mm_cvtepu8_epi32(_mm_srli_si128(v2_128, 4));
+        __m256 v2_f = _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+
+        // Dequantize: (val * delta) + min
+        __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+        // Compute dot product with masking (sum256 is still zero here, so assign)
+        sum256 = _mm256_mul_ps(v1, v2_dequant);
+    }
+
+    // If the remainder is >= 8, have another step of 8 floats
+    if constexpr (residual >= 8) {
+        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
+    }
+
+    // We dealt with the residual part. We are left with some multiple of 16 floats.
+    // Each iteration handles 16 floats as two 256-bit steps; pre-checked so nothing is read
+    // when no full chunk is left.
+    while (pVect1 < pEnd1) {
+        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
+    }
+    return 1.0f - my_mm256_reduce_add_ps(sum256);
+}
diff --git a/src/VecSim/spaces/IP/IP_SSE_SQ8.h b/src/VecSim/spaces/IP/IP_SSE_SQ8.h
new file mode 100644
index 000000000..deced094c
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SSE_SQ8.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include <iostream>
+#include <string.h>
+
+// One SSE step over 4 elements: dequantizes 4 bytes from pVect2 as (byte * delta + min_val),
+// multiplies them with 4 floats from pVect1 and adds the products to sum_prod. Advances both
+// pointers by 4. Widening uses SSE2 unpacks because _mm_cvtepu8_epi32 requires SSE4.1.
+static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, __m128 &sum_prod,
+                                    const __m128 &min_val_vec, const __m128 &delta_vec) {
+    // Load 4 float elements from pVect1
+    __m128 v1 = _mm_loadu_ps(pVect1);
+    pVect1 += 4;
+
+    // Load 4 uint8 elements via memcpy (no alignment/aliasing assumptions), then zero-extend
+    int32_t packed;
+    memcpy(&packed, pVect2, sizeof(packed));
+    const __m128i zero = _mm_setzero_si128();
+    __m128i v2_i = _mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(packed), zero), zero);
+    pVect2 += 4;
+    // Convert to float, dequantize as (val * delta) + min_val, and accumulate the products
+    __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2_i), delta_vec), min_val_vec);
+    sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2_dequant));
+}
+
+// Shared SSE kernel: returns the raw dot product <pVect1v, dequant(pVect2v)> (NOT 1 - ip).
+// pVect1v holds `dimension` fp32 values; pVect2v holds `dimension` uint8 values followed by
+// the fp32 parameters min and delta, and element i dequantizes to q[i] * delta + min.
+// Only residual % 4 is used: those 1-3 elements go first, everything else in steps of 4.
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_SSE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float *pVect1 = (float *)pVect1v;
+    uint8_t *quantized = (uint8_t *)pVect2v;
+
+    // The parameters start at byte offset `dimension`, which need not be 4-byte aligned, so
+    // copy them out instead of dereferencing a possibly misaligned float pointer.
+    float params[2];
+    memcpy(params, quantized + dimension, sizeof(params));
+    const float min = params[0];
+    const float delta = params[1];
+
+    // Create broadcast vectors for SIMD operations
+    __m128 min_val_vec = _mm_set1_ps(min);
+    __m128 delta_vec = _mm_set1_ps(delta);
+
+    const float *pEnd1 = pVect1 + dimension;
+
+    __m128 sum = _mm_setzero_ps();
+
+    // Handle the 1-3 leading elements that do not fill a whole 4-lane step
+    if constexpr (residual % 4) {
+        __m128 v1;
+        __m128 v2_dequant = _mm_setzero_ps();
+
+        if constexpr (residual % 4 == 3) {
+            // Load 3 floats into lanes 0, 2 and 3; lane 1 stays 0
+            v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0
+            v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1)); // load 2 more floats into high part
+
+            // Dequantize first value
+            float dequant0 = quantized[0] * delta + min;
+            v2_dequant = _mm_load_ss(&dequant0);
+
+            // Dequantize next two values (same lane placement as v1)
+            float dequant_high[2] = {
+                quantized[1] * delta + min,
+                quantized[2] * delta + min
+            };
+            v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high);
+        } else if constexpr (residual % 4 == 2) {
+            // Load 2 floats into the high lanes; the low lanes stay 0
+            v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1);
+
+            // Dequantize two values
+            float dequant_high[2] = {
+                quantized[0] * delta + min,
+                quantized[1] * delta + min
+            };
+            v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high);
+        } else if constexpr (residual % 4 == 1) {
+            // Load 1 float and set the last three to 0
+            v1 = _mm_load_ss(pVect1);
+
+            // Dequantize one value
+            float dequant0 = quantized[0] * delta + min;
+            v2_dequant = _mm_load_ss(&dequant0);
+        }
+
+        pVect1 += residual % 4;
+        quantized += residual % 4;
+        sum = _mm_mul_ps(v1, v2_dequant);
+    }
+
+    // Process 4 elements at a time
+    while (pVect1 < pEnd1) {
+        InnerProductStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+    }
+
+    // TmpRes must be 16 bytes aligned.
+    float PORTABLE_ALIGN16 TmpRes[4];
+    _mm_store_ps(TmpRes, sum);
+    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+}
+
+// SQ8 inner-product distance (SSE): 1 minus the dot product returned by the shared kernel.
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductSIMD16_SSE_IMP<residual>(pVect1v, pVect2v, dimension);
+}
+
+// SQ8 cosine distance (SSE): 1 - dot * inv_norm, where dot is the raw dot product with the
+// dequantized vector and inv_norm is the third fp32 value stored after the quantized bytes.
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    // inv_norm follows min_val and delta; its byte offset need not be 4-byte aligned, so
+    // copy it out instead of dereferencing a possibly misaligned float pointer.
+    float inv_norm;
+    memcpy(&inv_norm, pVect2 + dimension + 2 * sizeof(float), sizeof(inv_norm));
+
+    // Raw dot product from the shared kernel. Nothing is printed here: distance kernels
+    // must not write to stdout.
+    const float res = SQ8_InnerProductSIMD16_SSE_IMP<residual>(pVect1v, pVect2v, dimension);
+
+    // Apply the stored inverse norm and convert the similarity into a distance
+    return 1.0f - res * inv_norm;
+}
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
new file mode 100644
index 000000000..d6c0faa3d
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include <arm_sve.h>
+#include <iostream>
+#include <string.h>
+
+// One SVE step over svcntw() elements starting at `offset`: dequantizes the bytes of pVect2
+// as (byte * delta + min_val), multiplies them with the floats of pVect1 and accumulates
+// into sum. Only `offset` is advanced; the base pointers are left untouched.
+static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, size_t &offset,
+                                    svfloat32_t &sum, const svfloat32_t &min_val_vec,
+                                    const svfloat32_t &delta_vec) {
+    svbool_t pg = svptrue_b32();
+    // Load float elements from pVect1
+    svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
+
+    // Load exactly one byte per 32-bit lane and zero-extend it to uint32 in the same
+    // instruction. A full svld1_u8 would fetch 4x as many bytes (reading past the step),
+    // and reinterpreting u8 as u32 would pack 4 bytes per lane instead of widening them.
+    svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset);
+
+    // Convert uint32 to float32
+    svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32);
+
+    // Dequantize: (val * delta) + min_val
+    svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec);
+
+    // Compute dot product and add to sum
+    sum = svmla_f32_z(pg, sum, v1, v2_dequant);
+
+    // Move to the next set of elements
+    offset += svcntw();
+}
+
+// Shared SVE kernel: returns the raw dot product <pVect1v, dequant(pVect2v)> (NOT 1 - ip).
+// pVect1v holds `dimension` fp32 values; pVect2v holds `dimension` uint8 values followed by
+// the fp32 parameters min and delta, and element i dequantizes to q[i] * delta + min.
+// partial_chunk enables the predicated (dimension % svcntw()) head; additional_steps is the
+// number (0-3) of single full-vector steps to run after the 4x-unrolled main loop.
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float *pVect1 = (float *)pVect1v;
+    uint8_t *quantized = (uint8_t *)pVect2v;
+    size_t offset = 0;
+
+    // The parameters start at byte offset `dimension`, which need not be 4-byte aligned, so
+    // copy them out instead of dereferencing a possibly misaligned float pointer.
+    float params[2]; // {min, delta}
+    memcpy(params, quantized + dimension, sizeof(params));
+
+    // Create broadcast vectors for SIMD operations
+    svbool_t pg = svptrue_b32();
+    svfloat32_t min_val_vec = svdup_f32(params[0]);
+    svfloat32_t delta_vec = svdup_f32(params[1]);
+
+    // Get the number of 32-bit elements per vector at runtime
+    uint64_t sve_word_count = svcntw();
+
+    // Multiple accumulators to increase instruction-level parallelism
+    svfloat32_t sum0 = svdup_f32(0.0f);
+    svfloat32_t sum1 = svdup_f32(0.0f);
+    svfloat32_t sum2 = svdup_f32(0.0f);
+    svfloat32_t sum3 = svdup_f32(0.0f);
+
+    // Handle partial chunk if needed
+    if constexpr (partial_chunk) {
+        size_t remaining = dimension % sve_word_count;
+        if (remaining > 0) {
+            // Create predicate for the remaining elements (explicit uint64_t overload)
+            svbool_t pg_partial = svwhilelt_b32_u64(0, remaining);
+
+            // Load float elements from pVect1 with predicate
+            svfloat32_t v1 = svld1_f32(pg_partial, pVect1);
+
+            // Load one byte per active 32-bit lane and zero-extend it to uint32 in the same
+            // instruction (a u8 load + reinterpret would pack 4 bytes per lane, not widen them)
+            svuint32_t v2_u32 = svld1ub_u32(pg_partial, quantized);
+
+            // Convert uint32 to float32
+            svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32);
+
+            // Dequantize: (val * delta) + min_val
+            svfloat32_t v2_dequant = svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec);
+
+            // Compute dot product and add to sum
+            sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant);
+
+            // Move pointers past the partial chunk
+            pVect1 += remaining;
+            quantized += remaining;
+        }
+    }
+
+    // Process 4 chunks at a time in the main loop
+    auto chunk_size = 4 * sve_word_count;
+    const size_t number_of_chunks = (dimension - (partial_chunk ? dimension % sve_word_count : 0)) / chunk_size;
+    for (size_t i = 0; i < number_of_chunks; i++) {
+        InnerProductStep(pVect1, quantized, offset, sum0, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, quantized, offset, sum1, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, quantized, offset, sum2, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, quantized, offset, sum3, min_val_vec, delta_vec);
+    }
+
+    // Handle remaining steps (0-3)
+    if constexpr (additional_steps > 0) {
+        InnerProductStep(pVect1, quantized, offset, sum0, min_val_vec, delta_vec);
+    }
+    if constexpr (additional_steps > 1) {
+        InnerProductStep(pVect1, quantized, offset, sum1, min_val_vec, delta_vec);
+    }
+    if constexpr (additional_steps > 2) {
+        InnerProductStep(pVect1, quantized, offset, sum2, min_val_vec, delta_vec);
+    }
+
+    // Combine the accumulators
+    svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
+    sum = svadd_f32_z(pg, sum, sum2);
+    sum = svadd_f32_z(pg, sum, sum3);
+
+    // Horizontal sum of all elements in the vector (raw dot product; callers form the distance)
+    return svaddv_f32(pg, sum);
+}
+
+// SQ8 inner-product distance (SVE): 1 minus the dot product returned by the shared kernel.
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+}
+// SQ8 cosine distance (SVE): 1 - dot * inv_norm, where dot is the raw dot product with the
+// dequantized vector and inv_norm is the third fp32 value stored after the quantized bytes.
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // inv_norm follows min_val and delta; its byte offset need not be 4-byte aligned, so
+    // copy it out instead of dereferencing a possibly misaligned float pointer.
+    float inv_norm;
+    memcpy(&inv_norm, pVect2 + dimension + 2 * sizeof(float), sizeof(inv_norm));
+    // Raw dot product from the shared kernel, scaled by inv_norm and turned into a distance
+    const float res = SQ8_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+    return 1.0f - res * inv_norm;
+}
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index a74d2e59a..497605744 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -33,6 +33,122 @@ using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
 namespace spaces {
+    dist_func_t<float> IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
+        unsigned char dummy_alignment; // sink so the `*alignment` writes below are valid when the caller passes nullptr
+        if (alignment == nullptr) {
+            alignment = &dummy_alignment;
+        }
+        // Selects the SQ8 inner-product distance function for `dim`: the naive SQ8_InnerProduct unless a SIMD branch below returns first.
+        dist_func_t<float> ret_dist_func = SQ8_InnerProduct;
+        [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+    #ifdef CPU_FEATURES_ARCH_AARCH64
+        // AArch64: the first supported feature wins (SVE2, then SVE); *alignment is not written on these paths.
+    #ifdef OPT_SVE2
+        if (features.sve2) {
+            return Choose_SQ8_IP_implementation_SVE2(dim);
+        }
+    #endif
+    #ifdef OPT_SVE
+        if (features.sve) {
+            return Choose_SQ8_IP_implementation_SVE(dim);
+        }
+    #endif
+    // #ifdef OPT_NEON
+    //     if (features.asimd) {
+    //         return Choose_SQ8_IP_implementation_NEON(dim);
+    //     }
+    // #endif
+    // NOTE(review): the NEON dispatch above is commented out - presumably no SQ8 NEON kernel exists yet; confirm.
+    #endif
+
+    #ifdef CPU_FEATURES_ARCH_X86_64
+        // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+        if (dim < 16) {
+            return ret_dist_func;
+        }
+    #ifdef OPT_AVX512_F_BW_VL_VNNI
+        if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
+            if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
+                *alignment = 16 * sizeof(float); // handles 16 floats
+            return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
+        }
+    #endif
+    #ifdef OPT_AVX // NOTE(review): Choose_SQ8_IP_implementation_AVX2 is defined in functions/AVX2.cpp but is not dispatched here
+        if (features.avx) {
+            if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
+                *alignment = 8 * sizeof(float); // handles 8 floats
+            return Choose_SQ8_IP_implementation_AVX(dim);
+        }
+    #endif
+    #ifdef OPT_SSE
+        if (features.sse) {
+            if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
+                *alignment = 4 * sizeof(float); // handles 4 floats
+            return Choose_SQ8_IP_implementation_SSE(dim);
+        }
+    #endif
+    #endif // __x86_64__
+        return ret_dist_func; // no SIMD branch matched: fall back to the naive implementation
+    }
+
+dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
+        unsigned char dummy_alignment; // sink so the `*alignment` writes below are valid when the caller passes nullptr
+        if (alignment == nullptr) {
+            alignment = &dummy_alignment;
+        }
+        // Selects the SQ8 cosine distance function for `dim`: the naive SQ8_Cosine unless a SIMD branch below returns first.
+        dist_func_t<float> ret_dist_func = SQ8_Cosine;
+        [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+    #ifdef CPU_FEATURES_ARCH_AARCH64
+        // AArch64: the first supported feature wins (SVE2, then SVE); *alignment is not written on these paths.
+    #ifdef OPT_SVE2
+        if (features.sve2) {
+            return Choose_SQ8_Cosine_implementation_SVE2(dim);
+        }
+    #endif
+    #ifdef OPT_SVE
+        if (features.sve) {
+            return Choose_SQ8_Cosine_implementation_SVE(dim);
+        }
+    #endif
+    // #ifdef OPT_NEON
+    //     if (features.asimd) {
+    //         return Choose_SQ8_Cosine_implementation_NEON(dim);
+    //     }
+    // #endif
+    // NEON dispatch stays disabled (as in IP_SQ8_GetDistFunc): no Choose_SQ8_Cosine_implementation_NEON is provided yet.
+    #endif
+
+    #ifdef CPU_FEATURES_ARCH_X86_64
+        // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+        if (dim < 16) {
+            return ret_dist_func;
+        }
+    #ifdef OPT_AVX512_F_BW_VL_VNNI
+        if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
+            if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
+                *alignment = 16 * sizeof(float); // handles 16 floats
+            return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
+        }
+    #endif
+    // #ifdef OPT_AVX
+    //     if (features.avx) {
+    //         if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
+    //             *alignment = 8 * sizeof(float); // handles 8 floats
+    //         return Choose_SQ8_Cosine_implementation_AVX(dim);
+    //     }
+    // #endif
+    #ifdef OPT_SSE
+        if (features.sse) {
+            if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
+                *alignment = 4 * sizeof(float); // handles 4 floats
+            return Choose_SQ8_Cosine_implementation_SSE(dim);
+        }
+    #endif
+    #endif // __x86_64__
+        return ret_dist_func; // no SIMD branch matched: fall back to the naive implementation
+    }
+
 dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
     unsigned char dummy_alignment;
     if (alignment == nullptr) {
diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h
index 70aee6244..e375e8e37 100644
--- a/src/VecSim/spaces/IP_space.h
+++ b/src/VecSim/spaces/IP_space.h
@@ -5,11 +5,14 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #pragma once
 #include "VecSim/spaces/spaces.h"
 
 namespace spaces {
+dist_func_t<float> IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                      const void *arch_opt = nullptr);
+
 dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                        const void *arch_opt = nullptr);
 dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
@@ -26,4 +29,6 @@ dist_func_t<float> IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = n
                                         const void *arch_opt = nullptr);
 dist_func_t<float> Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                             const void *arch_opt = nullptr);
+dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                           const void *arch_opt = nullptr);
 } // namespace spaces
diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp
index 395c158b8..08ea8674c 100644
--- a/src/VecSim/spaces/L2/L2.cpp
+++ b/src/VecSim/spaces/L2/L2.cpp
@@ -14,6 +14,25 @@
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
+float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    // pVect1v: `dimension` floats. pVect2v: quantized blob structured as
+    // [quantized values (uint8_t * dim)][min_val (float)][delta (float)][inv_norm (float)].
+    const float *query = static_cast<const float *>(pVect1v);
+    const uint8_t *codes = static_cast<const uint8_t *>(pVect2v);
+    const float *params = reinterpret_cast<const float *>(codes + dimension);
+    const float min_val = params[0];
+    const float delta = params[1];
+    const float inv_norm = params[2];
+
+    float sum_sq = 0;
+    for (size_t idx = 0; idx != dimension; ++idx) {
+        const float stored = (codes[idx] * delta + min_val) * inv_norm; // dequantize, then scale by inv_norm
+        const float diff = query[idx] - stored;
+        sum_sq += diff * diff;
+    }
+    return sum_sq;
+}
+
 float FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
     float *vec1 = (float *)pVect1v;
     float *vec2 = (float *)pVect2v;
diff --git a/src/VecSim/spaces/L2/L2.h b/src/VecSim/spaces/L2/L2.h
index b3ac4d4c7..055e8c630 100644
--- a/src/VecSim/spaces/L2/L2.h
+++ b/src/VecSim/spaces/L2/L2.h
@@ -10,6 +10,8 @@
 
 #include <cstdlib>
 
+float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
+
 float FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
 
 double FP64_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
new file mode 100644
index 000000000..e4cf82c45
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+*/
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+
+static inline void L2SqrStep(float *&pVect1, float *&pVect2, __m256 &sum) {
+    // Consume 8 floats from each input, then advance both cursors past them.
+    const __m256 lhs = _mm256_loadu_ps(pVect1);
+    const __m256 rhs = _mm256_loadu_ps(pVect2);
+    pVect1 += 8;
+    pVect2 += 8;
+    const __m256 delta = _mm256_sub_ps(lhs, rhs);
+    sum = _mm256_add_ps(sum, _mm256_mul_ps(delta, delta)); // mul + add: _mm256_fmadd_ps needs the FMA extension
+}
+
+template <unsigned char residual> // 0..15; presumably dimension % 16 - TODO confirm against the chooser
+float FP32_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float *pVect1 = (float *)pVect1v;
+    float *pVect2 = (float *)pVect2v;
+    // NOTE(review): both inputs are read as fp32 although this header is L2_AVX_SQ8.h - looks like a placeholder copy; confirm.
+    const float *pEnd1 = pVect1 + dimension;
+    // Accumulates squared differences, 8 lanes at a time; collapsed by my_mm256_reduce_add_ps before returning.
+    __m256 sum = _mm256_setzero_ps();
+
+    // Deal with 1-7 floats with mask loading, if needed
+    if constexpr (residual % 8) {
+        __mmask8 constexpr mask8 = (1 << (residual % 8)) - 1;
+        __m256 v1 = my_mm256_maskz_loadu_ps<mask8>(pVect1);
+        pVect1 += residual % 8;
+        __m256 v2 = my_mm256_maskz_loadu_ps<mask8>(pVect2);
+        pVect2 += residual % 8;
+        __m256 diff = _mm256_sub_ps(v1, v2);
+        sum = _mm256_mul_ps(diff, diff); // plain assignment: sum is still all-zero at this point
+    }
+
+    // If the remainder is >=8, have another step of 8 floats
+    if constexpr (residual >= 8) {
+        L2SqrStep(pVect1, pVect2, sum);
+    }
+
+    // We dealt with the residual part. We are left with some multiple of 16 floats.
+    // In each iteration we calculate 16 floats = 512 bits.
+    do { // runs at least once, so at least 16 floats are assumed to remain here
+        L2SqrStep(pVect1, pVect2, sum);
+        L2SqrStep(pVect1, pVect2, sum);
+    } while (pVect1 < pEnd1);
+
+    return my_mm256_reduce_add_ps(sum);
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 5304f1f86..488e2fe5a 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -33,6 +33,64 @@ using float16 = vecsim_types::float16;
 
 namespace spaces {
 
+    dist_func_t<float> L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
+        unsigned char dummy_alignment;
+        if (!alignment) {
+            alignment = &dummy_alignment;
+        }
+        // Returns the SQ8 squared-L2 distance function; for now that is always the naive SQ8_L2Sqr.
+        dist_func_t<float> ret_dist_func = SQ8_L2Sqr;
+        // Every SIMD dispatch branch below is commented out, so `dim`, `features` and `*alignment` are not used yet.
+        [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+    // #ifdef CPU_FEATURES_ARCH_AARCH64
+    // #ifdef OPT_SVE2
+    //     if (features.sve2) {
+    //         return Choose_FP32_L2_implementation_SVE2(dim);
+    //     }
+    // #endif
+    // #ifdef OPT_SVE
+    //     if (features.sve) {
+    //         return Choose_FP32_L2_implementation_SVE(dim);
+    //     }
+    // #endif
+    // #ifdef OPT_NEON
+    //     if (features.asimd) {
+    //         return Choose_FP32_L2_implementation_NEON(dim);
+    //     }
+    // #endif
+    // #endif
+    // NOTE(review): the disabled AArch64 branches above still name the FP32 choosers; switch to SQ8 ones before enabling.
+    // #ifdef CPU_FEATURES_ARCH_X86_64
+    //     // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+    
+    //     if (dim < 16) {
+    //         return ret_dist_func;
+    //     }
+    // #ifdef OPT_AVX512F
+    //     if (features.avx512f) {
+    //         if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
+    //             *alignment = 16 * sizeof(float); // handles 16 floats
+    //         return Choose_SQ8_L2_implementation_AVX512F(dim);
+    //     }
+    // #endif
+    // #ifdef OPT_AVX
+    //     if (features.avx) {
+    //         if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
+    //             *alignment = 8 * sizeof(float); // handles 8 floats
+    //         return Choose_SQ8_L2_implementation_AVX(dim);
+    //     }
+    // #endif
+    // #ifdef OPT_SSE
+    //     if (features.sse) {
+    //         if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
+    //             *alignment = 4 * sizeof(float); // handles 4 floats
+    //         return Choose_SQ8_L2_implementation_SSE(dim);
+    //     }
+    // #endif
+    // #endif // __x86_64__
+        return ret_dist_func;
+    }
+
 dist_func_t<float> L2_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
     unsigned char dummy_alignment;
     if (!alignment) {
diff --git a/src/VecSim/spaces/computer/preprocessors.h b/src/VecSim/spaces/computer/preprocessors.h
index 1045299b4..ae434ea69 100644
--- a/src/VecSim/spaces/computer/preprocessors.h
+++ b/src/VecSim/spaces/computer/preprocessors.h
@@ -111,3 +111,134 @@ class CosinePreprocessor : public PreprocessorInterface {
     spaces::normalizeVector_f<DataType> normalize_func;
     const size_t dim;
 };
+
+// Preprocessor that scalar-quantizes storage vectors to one byte per dimension while leaving
+// query vectors in their original format. Compressed blob layout:
+// [quantized values (uint8_t * dim)][min_val (float)][delta (float)].
+// NOTE(review): SQ8_L2Sqr and the SQ8 cosine kernels also read an inv_norm float after delta,
+// which this layout does not store - confirm which producer feeds those kernels.
+template <typename DataType>
+class QuantPreprocessor : public PreprocessorInterface {
+public:
+    QuantPreprocessor(std::shared_ptr<VecSimAllocator> allocator, size_t dim, size_t bits_per_dim = 8)
+        : PreprocessorInterface(allocator), dim(dim), bits_per_dim(bits_per_dim),
+          compressed_bytes_count(calculateCompressedSize(dim)) {}
+
+    // Fills in the storage and query blobs: storage_blob gets a newly allocated quantized vector,
+    // query_blob an aligned, uncompressed copy of original_blob (see the cases below).
+    void preprocess(const void *original_blob, void *&storage_blob, void *&query_blob,
+                    size_t processed_bytes_count, unsigned char alignment) const override {
+        // Case 1: Blobs are different (one might be null, or both are allocated and processed separately)
+        if (storage_blob != query_blob) {
+            // Process storage blob (compress)
+            if (storage_blob == nullptr) {
+                storage_blob = this->allocator->allocate(compressed_bytes_count);
+                quantize(original_blob, storage_blob);
+            }
+            // Query blob remains uncompressed
+            if (query_blob == nullptr) {
+                query_blob = this->allocator->allocate_aligned(processed_bytes_count, alignment);
+                memcpy(query_blob, original_blob, processed_bytes_count);
+            }
+        } else { // Case 2: Blobs are the same or both null
+            if (query_blob == nullptr) {
+                // For query, we keep the original format
+                query_blob = this->allocator->allocate_aligned(processed_bytes_count, alignment);
+                memcpy(query_blob, original_blob, processed_bytes_count);
+                // For storage, we compress
+                storage_blob = this->allocator->allocate(compressed_bytes_count);
+                quantize(original_blob, storage_blob);
+            } else {
+                // If both point to the same memory, we need to separate them
+                void *new_storage = this->allocator->allocate(compressed_bytes_count);
+                quantize(query_blob, new_storage);
+                storage_blob = new_storage;
+            }
+        }
+    }
+
+    // Quantizes into `blob`. A null blob is allocated and filled from original_blob; a non-null
+    // blob is treated as the uncompressed input and replaced by a new compressed allocation.
+    void preprocessForStorage(const void *original_blob, void *&blob,
+                              size_t processed_bytes_count) const override {
+        if (blob == nullptr) {
+            blob = this->allocator->allocate(compressed_bytes_count);
+            quantize(original_blob, blob);
+        } else {
+            // Compress out-of-place, then swap the old allocation for the compressed one
+            void *temp = this->allocator->allocate(compressed_bytes_count);
+            quantize(blob, temp);
+            this->allocator->free_allocation(blob);
+            blob = temp;
+        }
+    }
+
+    // Queries are not quantized: a null blob becomes an aligned copy of original_blob.
+    void preprocessQuery(const void *original_blob, void *&blob, size_t processed_bytes_count,
+                         unsigned char alignment) const override {
+        if (blob == nullptr) {
+            blob = this->allocator->allocate_aligned(processed_bytes_count, alignment);
+            memcpy(blob, original_blob, processed_bytes_count);
+        }
+    }
+
+    // No-op apart from the null check: query vectors are never compressed.
+    void preprocessQueryInPlace(void *blob, size_t processed_bytes_count,
+                                unsigned char alignment) const override {
+        assert(blob);
+    }
+
+    // Quantizes `blob` over itself via a temporary buffer; assumes blob has room for the compressed data.
+    void preprocessStorageInPlace(void *blob, size_t processed_bytes_count) const override {
+        assert(blob);
+        void *temp = this->allocator->allocate(compressed_bytes_count);
+        quantize(blob, temp);
+        memcpy(blob, temp, compressed_bytes_count);
+        this->allocator->free_allocation(temp);
+    }
+
+private:
+    const size_t dim;
+    const size_t bits_per_dim;
+    const size_t compressed_bytes_count;
+
+    // Calculate the size needed for the compressed vector
+    static size_t calculateCompressedSize(size_t dim) {
+        // Quantized values (uint8 per dimension) + min (float32) + delta (float32)
+        return dim * sizeof(uint8_t) + 2 * sizeof(float);
+    }
+
+    // Quantize the vector from original format to compressed format
+    void quantize(const void *src, void *dst) const {
+        const DataType *src_data = static_cast<const DataType *>(src);
+        // Find min and max values in the vector
+        DataType min_val = src_data[0];
+        DataType max_val = src_data[0];
+        for (size_t i = 0; i < dim; i++) {
+            DataType val = src_data[i];
+            min_val = val < min_val ? val : min_val;
+            max_val = val > max_val ? val : max_val;
+        }
+
+        // Calculate delta (quantization step)
+        float delta = (max_val - min_val) / 255.0f;
+        if (delta == 0) {
+            delta = 1.0f; // Avoid division by zero if all values are the same
+        }
+
+        // Codes span 0..255, so they are stored as uint8_t: converting a float above 127 to
+        // int8_t is undefined behavior, and the SQ8 distance kernels read the bytes as uint8_t.
+        uint8_t *quant_values = static_cast<uint8_t *>(dst);
+        // Store min and delta right after the codes. memcpy is used because that offset may not
+        // be float-aligned when dim is not a multiple of sizeof(float).
+        const float params[2] = {static_cast<float>(min_val), delta};
+        memcpy(quant_values + dim, params, sizeof(params));
+
+        // Quantize each value
+        for (size_t i = 0; i < dim; i++) {
+            float normalized = (src_data[i] - min_val) / delta;
+            if (normalized < 0) normalized = 0;
+            if (normalized > 255) normalized = 255;
+            quant_values[i] = static_cast<uint8_t>(normalized);
+        }
+    }
+};
diff --git a/src/VecSim/spaces/functions/AVX.cpp b/src/VecSim/spaces/functions/AVX.cpp
index 7033a7c70..d0e5b6fbe 100644
--- a/src/VecSim/spaces/functions/AVX.cpp
+++ b/src/VecSim/spaces/functions/AVX.cpp
@@ -11,6 +11,7 @@
 #include "VecSim/spaces/L2/L2_AVX_FP32.h"
 #include "VecSim/spaces/L2/L2_AVX_FP64.h"
 
+#include "VecSim/spaces/IP/IP_AVX_SQ8.h"
 #include "VecSim/spaces/IP/IP_AVX_FP32.h"
 #include "VecSim/spaces/IP/IP_AVX_FP64.h"
 
@@ -18,6 +19,12 @@ namespace spaces {
 
 #include "implementation_chooser.h"
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX(size_t dim) { // AVX SQ8 inner-product kernel for `dim`
+    dist_func_t<float> ret_dist_func; // expected to be set by the macro below
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX); // presumably keyed on dim % 16 - see implementation_chooser.h
+    return ret_dist_func;
+}
+
 dist_func_t<float> Choose_FP32_IP_implementation_AVX(size_t dim) {
     dist_func_t<float> ret_dist_func;
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX);
diff --git a/src/VecSim/spaces/functions/AVX.h b/src/VecSim/spaces/functions/AVX.h
index 16b1e4f85..7f2c38b1f 100644
--- a/src/VecSim/spaces/functions/AVX.h
+++ b/src/VecSim/spaces/functions/AVX.h
@@ -12,6 +12,7 @@
 
 namespace spaces {
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX(size_t dim);
 dist_func_t<float> Choose_FP32_IP_implementation_AVX(size_t dim);
 dist_func_t<double> Choose_FP64_IP_implementation_AVX(size_t dim);
 
diff --git a/src/VecSim/spaces/functions/AVX2.cpp b/src/VecSim/spaces/functions/AVX2.cpp
index bd1997a23..5e0bde6c8 100644
--- a/src/VecSim/spaces/functions/AVX2.cpp
+++ b/src/VecSim/spaces/functions/AVX2.cpp
@@ -10,6 +10,7 @@
 
 #include "VecSim/spaces/IP/IP_AVX2_BF16.h"
 #include "VecSim/spaces/L2/L2_AVX2_BF16.h"
+#include "VecSim/spaces/IP/IP_AVX2_SQ8.h"
 
 namespace spaces {
 
@@ -27,6 +28,12 @@ dist_func_t<float> Choose_BF16_L2_implementation_AVX2(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2(size_t dim) { // AVX2 SQ8 inner-product kernel for `dim`
+    dist_func_t<float> ret_dist_func; // expected to be set by the macro below
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2); // presumably keyed on dim % 16 - see implementation_chooser.h
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX2.h b/src/VecSim/spaces/functions/AVX2.h
index 8ad04a8a5..06b0269de 100644
--- a/src/VecSim/spaces/functions/AVX2.h
+++ b/src/VecSim/spaces/functions/AVX2.h
@@ -14,5 +14,6 @@ namespace spaces {
 
 dist_func_t<float> Choose_BF16_IP_implementation_AVX2(size_t dim);
 dist_func_t<float> Choose_BF16_L2_implementation_AVX2(size_t dim);
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2(size_t dim);
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
index 9ef8e0efd..ffa62375d 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
@@ -14,10 +14,13 @@
 #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h"
 #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h"
 
+#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h"
+
 namespace spaces {
 
 #include "implementation_chooser.h"
 
+
 dist_func_t<float> Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
     dist_func_t<float> ret_dist_func;
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI);
@@ -54,6 +57,17 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { // AVX512 SQ8 inner-product kernel for `dim`
+    dist_func_t<float> ret_dist_func; // expected to be set by the macro below
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_InnerProductSIMD64_AVX512_BW_VL_VNNI); // presumably keyed on dim % 64 - see implementation_chooser.h
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { // AVX512 SQ8 cosine kernel for `dim`
+    dist_func_t<float> ret_dist_func; // expected to be set by the macro below
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_CosineSIMD64_AVX512_BW_VL_VNNI); // presumably keyed on dim % 64 - see implementation_chooser.h
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
index 384e2549b..b6760eca9 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
@@ -12,6 +12,8 @@
 
 namespace spaces {
 
+
+
 dist_func_t<float> Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 dist_func_t<float> Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 dist_func_t<float> Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
@@ -20,4 +22,8 @@ dist_func_t<float> Choose_UINT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim)
 dist_func_t<float> Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 dist_func_t<float> Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SSE.cpp b/src/VecSim/spaces/functions/SSE.cpp
index 8962306db..dd218d957 100644
--- a/src/VecSim/spaces/functions/SSE.cpp
+++ b/src/VecSim/spaces/functions/SSE.cpp
@@ -13,11 +13,24 @@
 
 #include "VecSim/spaces/IP/IP_SSE_FP32.h"
 #include "VecSim/spaces/IP/IP_SSE_FP64.h"
+#include "VecSim/spaces/IP/IP_SSE_SQ8.h"
 
 namespace spaces {
 
 #include "implementation_chooser.h"
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SSE(size_t dim) { // SSE SQ8 inner-product kernel for `dim`
+    dist_func_t<float> ret_dist_func; // expected to be set by the macro below
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_SSE); // presumably keyed on dim % 16 - see implementation_chooser.h
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SSE(size_t dim) { // SSE SQ8 cosine kernel for `dim`
+    dist_func_t<float> ret_dist_func; // expected to be set by the macro below
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_SSE); // presumably keyed on dim % 16 - see implementation_chooser.h
+    return ret_dist_func;
+}
+
 dist_func_t<float> Choose_FP32_IP_implementation_SSE(size_t dim) {
     dist_func_t<float> ret_dist_func;
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_SSE);
diff --git a/src/VecSim/spaces/functions/SSE.h b/src/VecSim/spaces/functions/SSE.h
index ab09de7d6..a86921a9c 100644
--- a/src/VecSim/spaces/functions/SSE.h
+++ b/src/VecSim/spaces/functions/SSE.h
@@ -12,6 +12,8 @@
 
 namespace spaces {
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SSE(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SSE(size_t dim);
 dist_func_t<float> Choose_FP32_IP_implementation_SSE(size_t dim);
 dist_func_t<double> Choose_FP64_IP_implementation_SSE(size_t dim);
 
diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp
index fd80512b1..39098bd8c 100644
--- a/src/VecSim/spaces/functions/SVE.cpp
+++ b/src/VecSim/spaces/functions/SVE.cpp
@@ -22,6 +22,7 @@
 
 #include "VecSim/spaces/L2/L2_SVE_UINT8.h"
 #include "VecSim/spaces/IP/IP_SVE_UINT8.h"
+#include "VecSim/spaces/IP/IP_SVE_SQ8.h"
 
 namespace spaces {
 
@@ -96,6 +97,18 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE(size_t dim) { // SVE SQ8 inner-product kernel for `dim`
+    dist_func_t<float> ret_dist_func; // expected to be set by the macro below
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_InnerProductSIMD_SVE, dim, svcntw); // presumably derives <partial_chunk, additional_steps> from dim and svcntw()
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE(size_t dim) { // SVE SQ8 cosine kernel for `dim`
+    dist_func_t<float> ret_dist_func; // expected to be set by the macro below
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_CosineSIMD_SVE, dim, svcntw); // presumably derives <partial_chunk, additional_steps> from dim and svcntw()
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h
index a98613449..86f7a7094 100644
--- a/src/VecSim/spaces/functions/SVE.h
+++ b/src/VecSim/spaces/functions/SVE.h
@@ -29,4 +29,7 @@ dist_func_t<float> Choose_UINT8_L2_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_UINT8_IP_implementation_SVE(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp
index 4758150d0..52ba020a4 100644
--- a/src/VecSim/spaces/functions/SVE2.cpp
+++ b/src/VecSim/spaces/functions/SVE2.cpp
@@ -20,6 +20,7 @@
 #include "VecSim/spaces/IP/IP_SVE_INT8.h"  // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/IP/IP_SVE_SQ8.h"   // SVE2 implementation is identical to SVE
 
 namespace spaces {
 
@@ -94,6 +95,18 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE2(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE2(size_t dim) { // SVE2 entry point; reuses the SVE SQ8 inner-product kernel
+    dist_func_t<float> ret_dist_func; // expected to be set by the macro below
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_InnerProductSIMD_SVE, dim, svcntw); // presumably derives <partial_chunk, additional_steps> from dim and svcntw()
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE2(size_t dim) { // SVE2 entry point; reuses the SVE SQ8 cosine kernel
+    dist_func_t<float> ret_dist_func; // expected to be set by the macro below
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_CosineSIMD_SVE, dim, svcntw); // presumably derives <partial_chunk, additional_steps> from dim and svcntw()
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h
index 248ca710b..cd3570caf 100644
--- a/src/VecSim/spaces/functions/SVE2.h
+++ b/src/VecSim/spaces/functions/SVE2.h
@@ -29,4 +29,7 @@ dist_func_t<float> Choose_UINT8_L2_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_UINT8_IP_implementation_SVE2(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE2(size_t dim);
+
 } // namespace spaces
diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt
index a5c9e7257..8a207228a 100644
--- a/tests/benchmark/CMakeLists.txt
+++ b/tests/benchmark/CMakeLists.txt
@@ -38,7 +38,7 @@ endif()
 # Spaces benchmarks								                                          #
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
-set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8)
+set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8 sq8)
 foreach(data_type IN LISTS DATA_TYPE)
 	add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp)
 	target_link_libraries(bm_spaces_${data_type} VectorSimilarity benchmark::benchmark)
diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh
index 78584130e..76389ad89 100755
--- a/tests/benchmark/benchmarks.sh
+++ b/tests/benchmark/benchmarks.sh
@@ -15,6 +15,7 @@ if [ -z "$BM_TYPE"  ] || [ "$BM_TYPE" = "benchmarks-all" ]; then
     echo spaces_fp16
     echo spaces_int8
     echo spaces_uint8
+    echo spaces_sq8
 
 elif [ "$BM_TYPE" = "benchmarks-default" ]; then
     echo basics_single_fp32
@@ -25,6 +26,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then
     echo spaces_fp16
     echo spaces_int8
     echo spaces_uint8
+    echo spaces_sq8
 
 
 # Basic benchmarks
@@ -91,6 +93,7 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then
     echo spaces_bf16
     echo spaces_int8
     echo spaces_uint8
+    echo spaces_sq8
 
 elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then
     echo spaces_fp32
@@ -104,4 +107,6 @@ elif [ "$BM_TYPE" = "bm-spaces-int8" ] ; then
     echo spaces_int8
 elif [ "$BM_TYPE" = "bm-spaces-uint8" ] ; then
     echo spaces_uint8
+elif [ "$BM_TYPE" = "bm-spaces-sq8" ] ; then
+    echo spaces_sq8
 fi
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
new file mode 100644
index 000000000..197765e85
--- /dev/null
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+*/
+#include "bm_spaces.h"
+#include "utils/tests_utils.h"
+
+class BM_VecSimSpaces_SQ8 : public benchmark::Fixture {
+protected:
+    std::mt19937 rng;
+    size_t dim;
+    float *v1;   // FP32 vector, `dim` elements
+    uint8_t *v2; // SQ8 buffer: `dim` quantized bytes followed by three float parameters
+
+public:
+    BM_VecSimSpaces_SQ8() { rng.seed(47); }
+    ~BM_VecSimSpaces_SQ8() = default;
+
+    void SetUp(const ::benchmark::State &state) {
+        dim = state.range(0);
+        v1 = new float[dim];
+        test_utils::populate_float_vec(v1, dim, 123);
+        // Allocate vector with extra space for min, delta and cosine calculations
+        v2 = new uint8_t[dim + sizeof(float) * 3];
+        test_utils::populate_float_vec_to_sq8(v2, dim, 1234);
+    }
+    void TearDown(const ::benchmark::State &state) {
+        delete[] v1; // both buffers come from new[], so they must be released with delete[]
+        delete[] v2;
+    }
+};
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+cpu_features::X86Features opt = cpu_features::GetX86Info().features;
+
+// AVX512_F_BW_VL_VNNI functions
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni;
+INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 32,
+                             avx512_f_bw_vl_vnni_supported);
+// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 32,
+//                                  avx512_f_bw_vl_vnni_supported);
+#endif // AVX512_F_BW_VL_VNNI
+
+#ifdef OPT_AVX2
+// AVX2 functions (guarded by the OPT_ prefixed macro, like every other SIMD level here)
+bool avx2_supported = opt.avx2;
+INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2, 32, avx2_supported);
+// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2, 32,
+//                                  avx2_supported);
+#endif // AVX2
+
+// AVX functions
+#ifdef OPT_AVX
+bool avx_supported = opt.avx;
+INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX, 32, avx_supported);
+// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX, 32,
+//                                  avx_supported);
+#endif // AVX
+// SSE functions
+#ifdef OPT_SSE
+bool sse_supported = opt.sse;
+INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, SSE, 32, sse_supported);
+// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SSE, 32,
+//                                   sse_supported);
+#endif // SSE
+#endif // x86_64
+
+// Naive algorithms (registered outside the x86_64 guard, so they are built on every target)
+
+INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, InnerProduct, 16);
+INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, Cosine, 16);
+INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, L2Sqr, 16);
+
+// Entry point of the benchmark executable
+
+BENCHMARK_MAIN();
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 2310782f4..2cf61cea8 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 
 #include <utility>
 #include <random>
@@ -305,6 +305,177 @@ TEST_F(SpacesTest, uint8_Cosine_no_optimization_func_test) {
     ASSERT_NEAR(dist, 0.0, 0.000001);
 }
 
+void common_ip_sq8(bool should_normalize, float expected_dist) {
+    // Checks SQ8_InnerProduct on two identical FP32 vectors, the second one SQ8-quantized.
+    size_t dim = 5;
+
+    // Create original vectors
+    float v1_orig[dim], v2_orig[dim];
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i + 1.5);
+    }
+
+    // Create SQ8 compressed version of v2
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    if (should_normalize) {
+        spaces::GetNormalizeFunc<float>()(v1_orig, dim);
+        spaces::GetNormalizeFunc<float>()(v2_orig, dim);
+    }
+
+    // Find min and max for quantization
+    float min_val = v2_orig[0];
+    float max_val = v2_orig[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v2_orig[i]);
+        max_val = std::max(max_val, v2_orig[i]);
+    }
+
+    // Calculate delta, the quantization step (no inverse norm is computed in this helper)
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    std::vector<uint8_t> v2_compressed(compressed_size);
+
+    // Layout of the buffer: dim quantized bytes, immediately followed by the float parameters
+    uint8_t *quant_values = reinterpret_cast<uint8_t *>(v2_compressed.data());
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+
+    // Store parameters (the third slot, inv_norm, is left zero-initialized)
+    params[0] = min_val;
+    params[1] = delta;
+
+    // Quantize each value: scale into [0, 255], clamp and round to the nearest integer
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v2_orig[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+    }
+
+    float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
+
+    // Since we're comparing identical vectors, the inner product distance should be close to
+    // expected
+    ASSERT_NEAR(dist, expected_dist, 0.01) << "SQ8_InnerProduct failed to match expected distance";
+}
+
+/* ======================== Tests SQ8 ========================= */
+TEST_F(SpacesTest, SQ8_ip_no_optimization_func_test) {
+    float expected_dist = -70.2147f; // Expected distance for identical, un-normalized vectors
+    common_ip_sq8(false, expected_dist);
+}
+// Same check with both vectors normalized before quantization; the expected distance is 0.
+TEST_F(SpacesTest, SQ8_ip_no_optimization_norm_func_test) { common_ip_sq8(true, 0.0f); }
+
+TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) {
+    // Checks SQ8_Cosine on a normalized FP32 vector against the SQ8 encoding of the same data.
+    size_t dim = 5;
+
+    // Create original vectors
+    float v1_orig[dim], v2_orig[dim];
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i + 1.5);
+    }
+
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    spaces::GetNormalizeFunc<float>()(v1_orig, dim);
+    // Find min and max for quantization
+    float min_val = v2_orig[0];
+    float max_val = v2_orig[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v2_orig[i]);
+        max_val = std::max(max_val, v2_orig[i]);
+    }
+    // Calculate delta, the quantization step (the inverse norm is computed further below)
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    // Compress v2
+    std::vector<uint8_t> v2_compressed(compressed_size);
+    uint8_t *quant_values = reinterpret_cast<uint8_t *>(v2_compressed.data());
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v2_orig[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+    }
+    // Calculate inverse norm from decompressed values
+    float inv_norm = 0.0f;
+    for (size_t i = 0; i < dim; i++) {
+        float decompressed_value = min_val + quant_values[i] * delta;
+        inv_norm += decompressed_value * decompressed_value;
+    }
+    inv_norm = 1.0f / std::sqrt(inv_norm);
+    // Store parameters
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
+
+    float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
+    ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance";
+}
+TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) {
+    // Checks SQ8_L2Sqr on a normalized FP32 vector against the SQ8 encoding of the same data.
+    size_t dim = 5;
+
+    // Create original vectors
+    float v1_orig[dim], v2_orig[dim];
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i + 1.5);
+    }
+
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    spaces::GetNormalizeFunc<float>()(v1_orig, dim);
+    // Find min and max for quantization
+    float min_val = v2_orig[0];
+    float max_val = v2_orig[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v2_orig[i]);
+        max_val = std::max(max_val, v2_orig[i]);
+    }
+    // Calculate delta, the quantization step (the inverse norm is computed further below)
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    // Compress v2
+    std::vector<uint8_t> v2_compressed(compressed_size);
+    uint8_t *quant_values = reinterpret_cast<uint8_t *>(v2_compressed.data());
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v2_orig[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+    }
+    // Calculate inverse norm from decompressed values
+    float inv_norm = 0.0f;
+    for (size_t i = 0; i < dim; i++) {
+        float decompressed_value = min_val + quant_values[i] * delta;
+        inv_norm += decompressed_value * decompressed_value;
+    }
+    inv_norm = 1.0f / std::sqrt(inv_norm);
+    // Store parameters
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
+    // NOTE(review): v1 is normalized while v2 is quantized un-normalized; the zero distance
+    // expected below presumably relies on SQ8_L2Sqr applying inv_norm - confirm.
+
+    float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
+    ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_L2Sqr failed to match expected distance";
+}
+
 /* ======================== Test Getters ======================== */
 
 TEST_F(SpacesTest, GetDistFuncInvalidMetricFP32) {
@@ -1889,3 +2060,364 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8_full_range_test) {
 
 INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest,
                          testing::Range(32UL, 64 * 2UL + 1));
+
+// Helper function to create SQ8 compressed vector.
+// Returns `dim` quantized bytes followed by three floats: min_val, delta and inv_norm, where
+// inv_norm is the inverse L2 norm of the de-quantized values.
+std::vector<uint8_t> CreateSQ8CompressedVector(const float *original, size_t dim) {
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    std::vector<uint8_t> compressed(compressed_size);
+
+    // Find min and max for quantization (the input is only read, so no working copy is made)
+    float min_val = original[0];
+    float max_val = original[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, original[i]);
+        max_val = std::max(max_val, original[i]);
+    }
+
+    // Calculate delta
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    // Quantize vector
+    uint8_t *quant_values = compressed.data();
+    float norm = 0.0f;
+    // Quantize each value and accumulate the squared norm of its de-quantized counterpart
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (original[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+        norm += (quant_values[i] * delta + min_val) * (quant_values[i] * delta + min_val);
+    }
+
+    float inv_norm = 1.0f / std::sqrt(norm);
+    // Store parameters
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
+
+    return compressed;
+}
+
+
+class SQ8SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
+
+// TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
+//     auto optimization = getCpuOptimizationFeatures();
+//     size_t dim = GetParam();
+
+//     // Create original vectors
+//     std::vector<float> v1_orig(dim);
+//     std::vector<float> v2_orig(dim);
+//     for (size_t i = 0; i < dim; i++) {
+//         v1_orig[i] = float(i + 1.5);
+//         v2_orig[i] = float(i * 0.75 + 1.0);
+//     }
+
+//     // Create SQ8 compressed version of v2
+//     std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim, false);
+
+//     auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+//         size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8;
+//         return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0;
+//     };
+
+//     dist_func_t<float> arch_opt_func;
+//     float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim);
+
+//     // Test different optimizations based on CPU features
+//     #ifdef OPT_AVX512_F_BW_VL_VNNI
+//     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl &&
+//     optimization.avx512vnni) {
+//         unsigned char alignment = 0;
+//         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+//         ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim))
+//             << "Unexpected distance function chosen for dim " << dim;
+//         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+//             << "AVX512 with dim " << dim;
+//         ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
+//         // Unset optimizations flag, so we'll choose the next optimization.
+//         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
+//         optimization.avx512vnni = 0;
+//     }
+//     #endif
+
+//     // Add other optimizations as needed (SVE2, SVE, NEON, etc.)
+
+//     // Test default implementation
+//     unsigned char alignment = 0;
+//     arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+//     ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim;
+//     ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+//         << "No optimization with dim " << dim;
+//     ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+// }
+
+// Checks, for every tested dimension, that IP_SQ8_GetDistFunc returns the implementation
+// expected for each available SIMD level and that its result matches the naive
+// SQ8_InnerProduct. After a level is verified its CPU flag is cleared, so the next lookup
+// falls through to the following level and finally to the naive implementation.
+TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+
+    // Create original vectors
+    std::vector<float> v1_orig(dim);
+    std::vector<float> v2_orig(dim);
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i * 0.75 + 1.0);
+    }
+    // Only the FP32 vector is normalized; v2 is quantized from its un-normalized values.
+    spaces::GetNormalizeFunc<float>()(v1_orig.data(), dim);
+
+    // Create SQ8 compressed version of v2
+    std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
+
+    // Expected alignment hint: the register width in bytes when dim is a multiple of it,
+    // otherwise 0.
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+    // The naive implementation is the reference for every optimized variant.
+    float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim);
+
+    // Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl &&
+        optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
+        // Unset the flag, so we'll choose the next optimization.
+        optimization.avx512f = 0;
+    }
+#endif
+#ifdef OPT_AVX
+    if (optimization.avx) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX with dim " << dim;
+        // NOTE(review): the alignment check is disabled for this level.
+        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
+        // Unset the flag, so we'll choose the next optimization.
+        optimization.avx = 0;
+    }
+#endif
+#ifdef OPT_SSE
+    if (optimization.sse) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SSE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SSE with dim " << dim;
+        // NOTE(review): the alignment check is disabled for this level.
+        // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim;
+        // Unset the flag, so we'll choose the next optimization.
+        optimization.sse = 0;
+    }
+#endif
+
+
+    // Test default implementation
+    unsigned char alignment = 0;
+    arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_InnerProduct)
+        << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
+
+// SQ8SpacesOptimizationTest is already instantiated with this same dimension range under the
+// SQ8OptFuncs prefix further below; instantiating it here as well would only re-run every
+// TEST_P of the suite (an instantiation applies to the whole suite, not to one test).
+
+TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+
+    // Create original vectors
+    std::vector<float> v1_orig(dim);
+    std::vector<float> v2_orig(dim);
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i * 0.75 + 1.0);
+    }
+
+    // Normalize both vectors
+    spaces::GetNormalizeFunc<float>()(v1_orig.data(), dim);
+    spaces::GetNormalizeFunc<float>()(v2_orig.data(), dim);
+
+    // Create SQ8 compressed version of the already-normalized v2
+    std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
+
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim);
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        // We don't align SQ8 vectors with cosine distance
+        // ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        // We don't align SQ8 vectors with cosine distance
+        // ASSERT_EQ(alignment, 0) << "SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
+
+    // Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl &&
+        optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        // We don't align SQ8 vectors with cosine distance
+        // ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim;
+        optimization.avx512f = 0;
+    }
+#endif
+
+#ifdef OPT_SSE
+    if (optimization.sse) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SSE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SSE with dim " << dim;
+        // We don't align SQ8 vectors with cosine distance
+        // ASSERT_EQ(alignment, 0) << "SSE with dim " << dim;
+        optimization.sse = 0;
+    }
+#endif
+
+    // Test default implementation
+    unsigned char alignment = 0;
+    arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
+
+// SQ8SpacesOptimizationTest is already instantiated with this same dimension range under the
+// SQ8OptFuncs prefix further below; instantiating it here as well would only re-run every
+// TEST_P of the suite (an instantiation applies to the whole suite, not to one test).
+
+// TEST_P(SQ8SpacesOptimizationTest, SQ8_full_range_test) {
+//     auto optimization = getCpuOptimizationFeatures();
+//     constexpr size_t dim = 512;
+
+//     // Create vectors with full range of values
+//     std::vector<float> v1(dim);
+//     std::vector<float> v2(dim);
+
+//     // v1: 0..255 followed by 255..0
+//     for (size_t i = 0; i < 256; i++) {
+//         v1[i] = static_cast<float>(i) / 255.0f;
+//         v1[256 + i] = static_cast<float>(255 - i) / 255.0f;
+//     }
+
+//     // v2: 255..0 followed by 0..255
+//     for (size_t i = 0; i < 256; i++) {
+//         v2[i] = static_cast<float>(255 - i) / 255.0f;
+//         v2[256 + i] = static_cast<float>(i) / 255.0f;
+//     }
+
+//     // Create SQ8 compressed version of v2
+//     std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2.data(), dim, false);
+
+//     // Create normalized version of v1 for cosine
+//     std::vector<float> v1_norm(v1);
+//     spaces::GetNormalizeFunc<float>()(v1_norm.data(), dim);
+
+//     // Create normalized SQ8 compressed version of v2 for cosine
+//     std::vector<uint8_t> v2_compressed_norm = CreateSQ8CompressedVector(v2.data(), dim, true);
+
+//     float baseline_l2 = SQ8_L2Sqr(v1.data(), v2_compressed.data(), dim);
+//     float baseline_ip = SQ8_InnerProduct(v1.data(), v2_compressed.data(), dim);
+//     float baseline_cosine = SQ8_Cosine(v1_norm.data(), v2_compressed_norm.data(), dim);
+
+//     dist_func_t<float> arch_opt_func;
+
+//     // Test different optimizations for each metric
+//     #ifdef OPT_AVX512F
+//     if (optimization.avx512f) {
+//         // L2 test
+//         arch_opt_func = Choose_SQ8_L2_implementation_AVX512F(dim);
+//         ASSERT_NEAR(baseline_l2, arch_opt_func(v1.data(), v2_compressed.data(), dim), 0.01)
+//             << "L2 AVX512 with dim " << dim;
+
+//         // IP test
+//         arch_opt_func = Choose_SQ8_IP_implementation_AVX512F(dim);
+//         ASSERT_NEAR(baseline_ip, arch_opt_func(v1.data(), v2_compressed.data(), dim), 0.01)
+//             << "IP AVX512 with dim " << dim;
+
+//         // Cosine test
+//         arch_opt_func = Choose_SQ8_Cosine_implementation_AVX512F(dim);
+//         ASSERT_NEAR(baseline_cosine, arch_opt_func(v1_norm.data(), v2_compressed_norm.data(),
+//         dim), 0.01)
+//             << "Cosine AVX512 with dim " << dim;
+
+//         optimization.avx512f = 0;
+//     }
+//     #endif
+
+//     // Add other optimizations as needed (SVE2, SVE, NEON, etc.)
+
+
+// Instantiate the test suite with dimensions to test
+INSTANTIATE_TEST_SUITE_P(SQ8OptFuncs, SQ8SpacesOptimizationTest,
+                         testing::Range(16UL, 16 * 2UL + 1));
+
+// #endif // defined(OPT_AVX512_FP16_VL) || defined(CPU_FEATURES_ARCH_AARCH64)
+
+// class INT8SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
+
+// TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) {
+//     auto optimization = getCpuOptimizationFeatures();
+//     size_t dim = GetParam();
+//     int8_t v1[dim];
+//     int8_t v2[dim];
+//     test_utils::populate_int8_vec(v1, dim
diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h
index a1526867b..bb041b0af 100644
--- a/tests/utils/tests_utils.h
+++ b/tests/utils/tests_utils.h
@@ -40,6 +40,57 @@ static void populate_uint8_vec(uint8_t *v, size_t dim, int seed = 1234) {
     }
 }
 
+static void populate_float_vec(float *v, size_t dim, int seed = 1234) {
+    // Fills v[0..dim) with uniform floats; a fixed seed always yields the same sequence.
+    std::mt19937 rng(seed);
+    std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+    for (float *out = v, *const end = v + dim; out != end; ++out) {
+        *out = uniform(rng);
+    }
+}
+
+static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) {
+    // Draws `dim` uniform floats from a generator with a fixed seed, then stores their SQ8
+    // encoding in `v`: dim quantized bytes followed by min, delta and the inverse norm.
+    std::mt19937 rng(seed);
+    std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+    std::vector<float> values(dim);
+    for (float &value : values) {
+        value = uniform(rng);
+    }
+
+    // Value range of the vector, used to derive the quantization step
+    float lowest = values[0];
+    float highest = values[0];
+    for (size_t idx = 1; idx < dim; ++idx) {
+        lowest = std::min(lowest, values[idx]);
+        highest = std::max(highest, values[idx]);
+    }
+
+    // Quantization step; falls back to 1 for a constant vector to avoid dividing by zero
+    float step = (highest - lowest) / 255.0f;
+    if (step == 0)
+        step = 1.0f;
+
+    // Quantize every value and accumulate the squared norm of the de-quantized vector
+    float sq_norm = 0.0f;
+    for (size_t idx = 0; idx < dim; ++idx) {
+        const float scaled = std::min(255.0f, (values[idx] - lowest) / step);
+        v[idx] = static_cast<uint8_t>(std::round(std::max(0.0f, scaled)));
+        const float restored = v[idx] * step + lowest;
+        sq_norm += restored * restored;
+    }
+
+    // Trailing parameters, stored right after the quantized bytes
+    const float inv_norm = 1.0f / std::sqrt(sq_norm);
+    float *trailer = reinterpret_cast<float *>(v + dim);
+    trailer[0] = lowest;
+    trailer[1] = step;
+    trailer[2] = inv_norm;
+}
+
+
 template <typename datatype>
 float integral_compute_norm(const datatype *vec, size_t dim) {
     return spaces::IntegralType_ComputeNorm<datatype>(vec, dim);

From af854320511463f81dc9e9e0d474840253d932ff Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:15:41 +0300
Subject: [PATCH 02/52] Change to IP_AVX512F

---
 .../spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h     | 176 ------------------
 src/VecSim/spaces/IP/IP_AVX512F_SQ8.h         | 139 ++++++++++++++
 .../spaces/functions/AVX512F_BW_VL_VNNI.cpp   |   8 +-
 tests/unit/test_spaces.cpp                    |  11 +-
 4 files changed, 148 insertions(+), 186 deletions(-)
 delete mode 100644 src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h
 create mode 100644 src/VecSim/spaces/IP/IP_AVX512F_SQ8.h

diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h
deleted file mode 100644
index 6c001efcf..000000000
--- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2006-Present, Redis Ltd.
- * All rights reserved.
- *
- * Licensed under your choice of the Redis Source Available License 2.0
- * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
- * GNU Affero General Public License v3 (AGPLv3).
- */
-#pragma once
-#include "VecSim/spaces/space_includes.h"
-#include <immintrin.h>
-#include <iostream>
-
-static inline void
-SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum,
-                     const __m512 &min_val_vec, const __m512 &delta_vec) {
-    // Load 16 float elements from pVec1
-    __m512 v1 = _mm512_loadu_ps(pVec1);
-    
-    // Load 16 uint8 elements from pVec2 and convert to __m512i
-    __m128i v2_128 = _mm_loadu_si128((__m128i*)pVec2);
-    __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
-    
-    // Convert uint8 to float
-    __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
-    
-    // Dequantize: (val * delta) + min_val
-    __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
-    
-    // Compute dot product and add to sum
-    sum = _mm512_fmadd_ps(v1, dequantized, sum);
-    
-    // Advance pointers
-    pVec1 += 16;
-    pVec2 += 16;
-}
-
-// Common implementation for both inner product and cosine similarity
-template <unsigned char residual> // 0..63
-float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) {
-    const float *pVec1 = static_cast<const float *>(pVec1v);
-    const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
-    const uint8_t *pEnd2 = pVec2 + dimension;
-    
-    // Get dequantization parameters from the end of pVec2
-    const float min_val = *reinterpret_cast<const float *>(pVec2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVec2 + dimension + sizeof(float));
-    
-    // Create broadcast vectors for SIMD operations
-    __m512 min_val_vec = _mm512_set1_ps(min_val);
-    __m512 delta_vec = _mm512_set1_ps(delta);
-    
-    // Initialize sum accumulator
-    __m512 sum = _mm512_setzero_ps();
-
-    // Deal with remainder first
-    if constexpr (residual) {
-        if constexpr (residual < 16) {
-            // Handle less than 16 elements
-            __mmask16 mask = (1U << residual) - 1;
-            
-            // Load masked float elements
-            __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1);
-            
-            // Load masked uint8 elements
-            __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i*>(pVec2));
-            __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
-            __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
-            
-            // Dequantize
-            __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
-            
-            // Compute dot product
-            sum = _mm512_mask_fmadd_ps(sum, mask, v1, dequantized);
-        } 
-        else if constexpr (residual == 16) {
-            // Handle exactly 16 elements
-            SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
-        }
-        else if constexpr (residual < 32) {
-            // Handle 16-31 elements: process 16 and then remainder
-            SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
-            
-            // Process remaining elements (residual - 16)
-            constexpr unsigned char remaining = residual - 16;
-            __mmask16 mask = (1U << remaining) - 1;
-            
-            // Load masked float elements
-            __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1);
-            
-            // Load masked uint8 elements
-            __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i*>(pVec2));
-            __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
-            __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
-            
-            // Dequantize
-            __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
-            
-            // Compute dot product
-            sum = _mm512_mask_fmadd_ps(sum, mask, v1, dequantized);
-        }
-        else if constexpr (residual == 32) {
-            // Handle exactly 32 elements: process two chunks of 16
-            SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
-            SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
-        }
-        else {
-            // Handle more than 32 elements: process chunks of 16 until less than 16 remain
-            constexpr size_t full_chunks = residual / 16;
-            for (size_t i = 0; i < full_chunks; i++) {
-                SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
-            }
-            
-            // Process remaining elements (residual % 16)
-            constexpr unsigned char remaining = residual % 16;
-            if constexpr (remaining > 0) {
-                __mmask16 mask = (1U << remaining) - 1;
-                
-                // Load masked float elements
-                __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1);
-                
-                // Load masked uint8 elements
-                __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i*>(pVec2));
-                __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
-                __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
-                
-                // Dequantize
-                __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
-                
-                // Compute dot product
-                sum = _mm512_mask_fmadd_ps(sum, mask, v1, dequantized);
-            }
-        }
-        
-        pVec1 += residual;
-        pVec2 += residual;
-    }
-    
-    // Process remaining full chunks of 16 elements
-    while (pVec2 < pEnd2) {
-        SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
-    }
-    
-    // Horizontal sum
-    float result = _mm512_reduce_add_ps(sum);
-    
-    // Return 1 - result as per the pattern in other implementations
-    return result;
-}
-
-template <unsigned char residual> // 0..63
-float SQ8_InnerProductSIMD64_AVX512_BW_VL_VNNI(const void *pVec1v,
-                                              const void *pVec2v,
-                                              size_t dimension) {
-    // Calculate inner product using common implementation
-    float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);
-    std::cout << "result: " << ip << std::endl;
-    
-    // Return 1 - result as per the pattern in other implementations
-    return 1.0f - ip;
-}
-
-template <unsigned char residual> // 0..63
-float SQ8_CosineSIMD64_AVX512_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
-                                         size_t dimension) {
-    // Calculate inner product using common implementation
-    float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);
-    
-    // Get the inverse norm factor stored after min_val and delta
-    const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
-    const float inv_norm = *reinterpret_cast<const float *>(pVec2 + dimension + 2 * sizeof(float));
-    std::cout << "result2: " << ip << std::endl;
-    // Return 1 - (ip * inv_norm) as per the pattern in other implementations
-    return 1.0f - ip * inv_norm;
-}
-
diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
new file mode 100644
index 000000000..7005b7a15
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include <immintrin.h>
+#include <iostream>
+
+static inline void
+SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum,
+                     const __m512 &min_val_vec, const __m512 &delta_vec) {
+    // Load 16 float elements from pVec1
+    __m512 v1 = _mm512_loadu_ps(pVec1);
+
+    // Load 16 uint8 elements from pVec2 and convert to __m512i
+    __m128i v2_128 = _mm_loadu_si128((__m128i*)pVec2);
+    __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
+
+    // Convert uint8 to float
+    __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
+
+    // Dequantize: (val * delta) + min_val
+    __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+    // Compute dot product and add to sum
+    sum = _mm512_fmadd_ps(v1, dequantized, sum);
+
+    // Advance pointers
+    pVec1 += 16;
+    pVec2 += 16;
+}
+
+// Common implementation for both inner product and cosine similarity
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, float inv_norm = 1.0f) {
+    const float *pVec1 = static_cast<const float *>(pVec1v);
+    const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
+    const uint8_t *pEnd2 = pVec2 + dimension;
+
+    // Get dequantization parameters from the end of pVec2
+    const float min_val = *reinterpret_cast<const float *>(pVec2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVec2 + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    __m512 min_val_vec = _mm512_set1_ps(min_val);
+    __m512 delta_vec = _mm512_set1_ps(delta);
+
+    // Initialize sum accumulator
+    __m512 sum = _mm512_setzero_ps();
+
+    // Deal with remainder first
+    if constexpr (residual > 0) {
+        // Handle less than 16 elements
+        __mmask16 mask = (1U << residual) - 1;
+
+        // Load masked float elements
+        __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1);
+
+        // Load masked uint8 elements
+        __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i*>(pVec2));
+        __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
+        __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
+
+
+        // Dequantize
+        __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
+        
+        // Compute dot product
+        __m512 product = _mm512_mul_ps(v1, dequantized);
+
+        
+        // Apply mask to product and add to sum
+        sum = _mm512_mask_add_ps(sum, mask, sum, product);
+        
+        pVec1 += residual;
+        pVec2 += residual;
+    }
+    
+    // Print and compare the residual sums
+    float simd_residual_sum = _mm512_reduce_add_ps(sum);
+    std::cout << "Residual part - SIMD sum: " << simd_residual_sum 
+              << ", Naive sum: " << naive_sum 
+              << ", Difference: " << std::abs(simd_residual_sum - naive_sum) << std::endl;
+    
+    // Process remaining full chunks of 16 elements
+    while (pVec2 <= pEnd2) {
+        SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
+    }
+
+    // Horizontal sum
+    float result = _mm512_reduce_add_ps(sum);
+
+    // Calculate full naive sum for comparison
+    float full_naive_sum = naive_sum;
+    const float *orig_pVec1 = static_cast<const float *>(pVec1v) + residual;
+    const uint8_t *orig_pVec2 = static_cast<const uint8_t *>(pVec2v) + residual;
+    for (size_t i = 0; i < dimension - residual; i++) {
+        float dequantized = orig_pVec2[i] * delta + min_val;
+        full_naive_sum += orig_pVec1[i] * dequantized;
+    }
+    
+    std::cout << "Full calculation - SIMD sum: " << result 
+              << ", Naive sum: " << full_naive_sum 
+              << ", Difference: " << std::abs(result - full_naive_sum) << std::endl;
+
+    // Return the raw inner product result
+    return result;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_AVX512F(const void *pVec1v,
+                                              const void *pVec2v,
+                                              size_t dimension) {
+    // Calculate inner product using common implementation
+    float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);
+    
+    // The inner product similarity is 1 - ip
+    return 1.0f - ip;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_AVX512F(const void *pVec1v, const void *pVec2v,
+                                         size_t dimension) {
+    // Get the inverse norm factor stored after min_val and delta
+    const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
+    const float inv_norm = *reinterpret_cast<const float *>(pVec2 + dimension + 2 * sizeof(float));
+    
+    // Calculate inner product using common implementation with normalization
+    float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension, inv_norm);
+    
+    // The cosine similarity is 1 - ip
+    return 1.0f - ip;
+}
+
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
index ffa62375d..d06a68d02 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
@@ -14,7 +14,7 @@
 #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h"
 #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h"
 
-#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h"
+#include "VecSim/spaces/IP/IP_AVX512F_SQ8.h"
 
 namespace spaces {
 
@@ -57,14 +57,14 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t
     return ret_dist_func;
 }
 
-dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F(size_t dim) {
     dist_func_t<float> ret_dist_func;
-    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_InnerProductSIMD64_AVX512_BW_VL_VNNI);
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX512F);
     return ret_dist_func;
 }
 dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
     dist_func_t<float> ret_dist_func;
-    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_CosineSIMD64_AVX512_BW_VL_VNNI);
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F);
     return ret_dist_func;
 }
 
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 2cf61cea8..6859fe30b 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2169,9 +2169,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         v2_orig[i] = float(i * 0.75 + 1.0);
     }
     spaces::GetNormalizeFunc<float>()(v1_orig.data(), dim);
-    // spaces::GetNormalizeFunc<float>()(v2_orig.data(), dim);
     // print v1_orig
-    std::cout << "v1_orig: ";
+    std::cout << "v1_normalized: ";
     for (size_t i = 0; i < dim; i++) {
         std::cout << v1_orig[i] << ", ";
     }
@@ -2196,8 +2195,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
     float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim);
 
     // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
-    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && optimization.avx512vnni) {
+    #ifdef OPT_AVX512F
+    if (optimization.avx512f) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim))
@@ -2303,12 +2302,12 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
     #endif
 
     // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
+    #ifdef OPT_AVX512F
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl &&
         optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
-        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim))
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F(dim))
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX512 with dim " << dim;

From b215799aaeaf61c492a01a3ca0e00ae09b898dcf Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:17:12 +0300
Subject: [PATCH 03/52] Remove debug output from SQ8_InnerProductImp and use a do-while main loop

---
 src/VecSim/spaces/IP/IP_AVX512F_SQ8.h | 28 +++------------------------
 1 file changed, 3 insertions(+), 25 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
index 7005b7a15..c179aa0e9 100644
--- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
@@ -81,35 +81,13 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi
         pVec2 += residual;
     }
     
-    // Print and compare the residual sums
-    float simd_residual_sum = _mm512_reduce_add_ps(sum);
-    std::cout << "Residual part - SIMD sum: " << simd_residual_sum 
-              << ", Naive sum: " << naive_sum 
-              << ", Difference: " << std::abs(simd_residual_sum - naive_sum) << std::endl;
-    
     // Process remaining full chunks of 16 elements
-    while (pVec2 <= pEnd2) {
+    do {
         SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
-    }
-
-    // Horizontal sum
-    float result = _mm512_reduce_add_ps(sum);
-
-    // Calculate full naive sum for comparison
-    float full_naive_sum = naive_sum;
-    const float *orig_pVec1 = static_cast<const float *>(pVec1v) + residual;
-    const uint8_t *orig_pVec2 = static_cast<const uint8_t *>(pVec2v) + residual;
-    for (size_t i = 0; i < dimension - residual; i++) {
-        float dequantized = orig_pVec2[i] * delta + min_val;
-        full_naive_sum += orig_pVec1[i] * dequantized;
-    }
-    
-    std::cout << "Full calculation - SIMD sum: " << result 
-              << ", Naive sum: " << full_naive_sum 
-              << ", Difference: " << std::abs(result - full_naive_sum) << std::endl;
+    } while (pVec1 < pEnd2);
 
     // Return the raw inner product result
-    return result;
+    return _mm512_reduce_add_ps(sum);;
 }
 
 template <unsigned char residual> // 0..15

From 8b4188b01450500d3e166a3450d2652b7ac92b3e Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:18:04 +0300
Subject: [PATCH 04/52] Bound the main loop by pVec1 (pEnd1) instead of pEnd2

---
 src/VecSim/spaces/IP/IP_AVX512F_SQ8.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
index c179aa0e9..8c32ca6a7 100644
--- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
@@ -40,7 +40,7 @@ template <unsigned char residual> // 0..15
 float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, float inv_norm = 1.0f) {
     const float *pVec1 = static_cast<const float *>(pVec1v);
     const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
-    const uint8_t *pEnd2 = pVec2 + dimension;
+    const uint8_t *pEnd1 = pVec1 + dimension;
 
     // Get dequantization parameters from the end of pVec2
     const float min_val = *reinterpret_cast<const float *>(pVec2 + dimension);
@@ -84,7 +84,7 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi
     // Process remaining full chunks of 16 elements
     do {
         SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
-    } while (pVec1 < pEnd2);
+    } while (pVec1 < pEnd1);
 
     // Return the raw inner product result
     return _mm512_reduce_add_ps(sum);;

From a1d1a162f3e3df1eda4c05dc72fe49c9dffa0060 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:18:36 +0300
Subject: [PATCH 05/52] Declare pEnd1 as const float *

---
 src/VecSim/spaces/IP/IP_AVX512F_SQ8.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
index 8c32ca6a7..36b2d0ff4 100644
--- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
@@ -40,7 +40,7 @@ template <unsigned char residual> // 0..15
 float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, float inv_norm = 1.0f) {
     const float *pVec1 = static_cast<const float *>(pVec1v);
     const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
-    const uint8_t *pEnd1 = pVec1 + dimension;
+    const float *pEnd1 = pVec1 + dimension;
 
     // Get dequantization parameters from the end of pVec2
     const float min_val = *reinterpret_cast<const float *>(pVec2 + dimension);

From b5860bbc61a6f7abc5e6565a93524b658dec1e72 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:19:58 +0300
Subject: [PATCH 06/52] finish

---
 src/VecSim/spaces/IP_space.cpp                   | 6 +++---
 src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h | 4 ++--
 tests/unit/test_spaces.cpp                       | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 497605744..fc1b18aa9 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -66,11 +66,11 @@ namespace spaces {
         if (dim < 16) {
             return ret_dist_func;
         }
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
-        if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
+    #ifdef OPT_AVX512F
+        if (features.avx512f) {
             if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
                 *alignment = 16 * sizeof(float); // handles 16 floats
-            return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
+            return Choose_SQ8_IP_implementation_AVX512F(dim);
         }
     #endif
     #ifdef OPT_AVX
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
index b6760eca9..c44dfe635 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
@@ -22,8 +22,8 @@ dist_func_t<float> Choose_UINT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim)
 dist_func_t<float> Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 dist_func_t<float> Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 
-dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
-dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F(size_t dim);
 
 
 } // namespace spaces
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 6859fe30b..fa840655f 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2199,7 +2199,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
     if (optimization.avx512f) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
-        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim))
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F(dim))
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX512 with dim " << dim;

From 0d07d718f35f9a196ce48b7f9523fb82239f61fa Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:20:42 +0300
Subject: [PATCH 07/52] now

---
 src/VecSim/spaces/IP_space.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index fc1b18aa9..07b849f0c 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -124,11 +124,11 @@ dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment,
         if (dim < 16) {
             return ret_dist_func;
         }
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
-        if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
+    #ifdef OPT_AVX512F
+        if (features.avx512f) {
             if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
                 *alignment = 16 * sizeof(float); // handles 16 floats
-            return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
+            return Choose_SQ8_Cosine_implementation_AVX512F(dim);
         }
     #endif
     // #ifdef OPT_AVX

From 66c49e8b3dd4b8f2eb1764c64befb3ff290b9e39 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:21:25 +0300
Subject: [PATCH 08/52] rename SQ8 Cosine chooser to Choose_SQ8_Cosine_implementation_AVX512F

---
 src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
index d06a68d02..3ce3b46ad 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
@@ -62,7 +62,7 @@ dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F(size_t dim) {
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX512F);
     return ret_dist_func;
 }
-dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F(size_t dim) {
     dist_func_t<float> ret_dist_func;
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F);
     return ret_dist_func;

From aa26c717782351b857ebec52ba02780bd5ae0bd0 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:22:49 +0300
Subject: [PATCH 09/52] in test

---
 tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
index 197765e85..e5f457d9c 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
@@ -39,8 +39,8 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features;
 
 // AVX512_F_BW_VL_VNNI functions
 #ifdef OPT_AVX512_F_BW_VL_VNNI
-bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni;
-INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 32,
+bool avx512_f_bw_vl_vnni_supported = opt.avx512f;
+INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F, 32,
                                 avx512_f_bw_vl_vnni_supported);
 // INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32,
 //                                  avx512_f_bw_vl_vnni_supported);

From 43b58a8e370c10d124bcd5ac84979335532a39c1 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:23:46 +0300
Subject: [PATCH 10/52] comment out alignment assertion in SQ8 IP test

---
 tests/unit/test_spaces.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index fa840655f..a251cb28b 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2203,7 +2203,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX512 with dim " << dim;
-        ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
         optimization.avx512f = 0;
     }
     #endif

From 1e12fa34153904c032fe28f42c809f7909a48702 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:35:13 +0300
Subject: [PATCH 11/52] back to bw

---
 ...512F_SQ8.h => IP_AVX512F_SQ8_BW_VL_VNNI.h} |  4 ++--
 src/VecSim/spaces/IP/IP_AVX_SQ8.h             | 22 +++++++++++++++++--
 src/VecSim/spaces/IP_space.cpp                | 12 +++++-----
 src/VecSim/spaces/functions/AVX512F.cpp       |  1 +
 src/VecSim/spaces/functions/AVX512F.h         |  3 +++
 .../spaces/functions/AVX512F_BW_VL_VNNI.cpp   | 10 ++++-----
 .../spaces/functions/AVX512F_BW_VL_VNNI.h     |  5 ++---
 7 files changed, 39 insertions(+), 18 deletions(-)
 rename src/VecSim/spaces/IP/{IP_AVX512F_SQ8.h => IP_AVX512F_SQ8_BW_VL_VNNI.h} (96%)

diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
similarity index 96%
rename from src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
rename to src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
index 36b2d0ff4..b33b3629c 100644
--- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
@@ -91,7 +91,7 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi
 }
 
 template <unsigned char residual> // 0..15
-float SQ8_InnerProductSIMD16_AVX512F(const void *pVec1v,
+float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v,
                                               const void *pVec2v,
                                               size_t dimension) {
     // Calculate inner product using common implementation
@@ -102,7 +102,7 @@ float SQ8_InnerProductSIMD16_AVX512F(const void *pVec1v,
 }
 
 template <unsigned char residual> // 0..15
-float SQ8_CosineSIMD16_AVX512F(const void *pVec1v, const void *pVec2v,
+float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
                                          size_t dimension) {
     // Get the inverse norm factor stored after min_val and delta
     const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
index 38c836652..b68de3c4d 100644
--- a/src/VecSim/spaces/IP/IP_AVX_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
@@ -33,7 +33,7 @@ static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256
 }
 
 template <unsigned char residual> // 0..15
-float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) {
+float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
     float *pVect1 = (float *)pVect1v;
     uint8_t *quantized = (uint8_t *)pVect2v;
 
@@ -112,5 +112,23 @@ float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_
         InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
     } while (pVect1 < pEnd1);
 
-    return 1.0f - my_mm256_reduce_add_ps(sum256);
+    return my_mm256_reduce_add_ps(sum256);
+}
+
+float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return SQ8_InnerProductImp<0>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    // Get dequantization parameters from the end of quantized vector
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+    
+    // Calculate inner product using common implementation with normalization
+    float ip = SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
+    
+    // For cosine, we need to account for the vector norms
+    // The inv_norm parameter is stored after min_val and delta in the quantized vector
+    return 1.0f - ip * inv_norm;
 }
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 07b849f0c..3ba81a92e 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -66,11 +66,11 @@ namespace spaces {
         if (dim < 16) {
             return ret_dist_func;
         }
-    #ifdef OPT_AVX512F
-        if (features.avx512f) {
+    #ifdef OPT_AVX512F_BW_VL_VNNI
+        if (features.avx512f && features.avx512bw && features.avx512vnni) {
             if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
                 *alignment = 16 * sizeof(float); // handles 16 floats
-            return Choose_SQ8_IP_implementation_AVX512F(dim);
+            return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
         }
     #endif
     #ifdef OPT_AVX
@@ -124,11 +124,11 @@ dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment,
         if (dim < 16) {
             return ret_dist_func;
         }
-    #ifdef OPT_AVX512F
-        if (features.avx512f) {
+    #ifdef OPT_AVX512F_BW_VL_VNNI
+        if (features.avx512f && features.avx512bw && features.avx512vnni) {
             if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
                 *alignment = 16 * sizeof(float); // handles 16 floats
-            return Choose_SQ8_Cosine_implementation_AVX512F(dim);
+            return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
         }
     #endif
     // #ifdef OPT_AVX
diff --git a/src/VecSim/spaces/functions/AVX512F.cpp b/src/VecSim/spaces/functions/AVX512F.cpp
index bcddbea91..c9124f3b4 100644
--- a/src/VecSim/spaces/functions/AVX512F.cpp
+++ b/src/VecSim/spaces/functions/AVX512F.cpp
@@ -16,6 +16,7 @@
 #include "VecSim/spaces/IP/IP_AVX512F_FP32.h"
 #include "VecSim/spaces/IP/IP_AVX512F_FP64.h"
 
+
 namespace spaces {
 
 #include "implementation_chooser.h"
diff --git a/src/VecSim/spaces/functions/AVX512F.h b/src/VecSim/spaces/functions/AVX512F.h
index 9a9e9b48a..cce00f0f1 100644
--- a/src/VecSim/spaces/functions/AVX512F.h
+++ b/src/VecSim/spaces/functions/AVX512F.h
@@ -20,4 +20,7 @@ dist_func_t<float> Choose_FP16_L2_implementation_AVX512F(size_t dim);
 dist_func_t<float> Choose_FP32_L2_implementation_AVX512F(size_t dim);
 dist_func_t<double> Choose_FP64_L2_implementation_AVX512F(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
index 3ce3b46ad..76809f6b5 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
@@ -14,7 +14,7 @@
 #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h"
 #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h"
 
-#include "VecSim/spaces/IP/IP_AVX512F_SQ8.h"
+#include "VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h"
 
 namespace spaces {
 
@@ -57,14 +57,14 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t
     return ret_dist_func;
 }
 
-dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F(size_t dim) {
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
     dist_func_t<float> ret_dist_func;
-    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX512F);
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI);
     return ret_dist_func;
 }
-dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F(size_t dim) {
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
     dist_func_t<float> ret_dist_func;
-    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F);
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI);
     return ret_dist_func;
 }
 
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
index c44dfe635..e2d587ef0 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
@@ -22,8 +22,7 @@ dist_func_t<float> Choose_UINT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim)
 dist_func_t<float> Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 dist_func_t<float> Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 
-dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F(size_t dim);
-dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F(size_t dim);
-
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 
 } // namespace spaces

From 984a0305adf2af668f8df2fd2b817d8fb83f026e Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:38:49 +0300
Subject: [PATCH 12/52] back again

---
 src/VecSim/spaces/IP/IP_AVX_SQ8.h                   | 3 ++-
 tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp | 5 +++--
 tests/unit/test_spaces.cpp                          | 4 ++--
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
index b68de3c4d..c917f7787 100644
--- a/src/VecSim/spaces/IP/IP_AVX_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
@@ -115,8 +115,9 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen
     return my_mm256_reduce_add_ps(sum256);
 }
 
+template <unsigned char residual> // 0..15
 float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    return SQ8_InnerProductImp<0>(pVect1v, pVect2v, dimension);
+    return SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
 }
 
 template <unsigned char residual> // 0..15
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
index e5f457d9c..cbf0b7e5b 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
@@ -39,8 +39,9 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features;
 
 // AVX512_F_BW_VL_VNNI functions
 #ifdef OPT_AVX512_F_BW_VL_VNNI
-bool avx512_f_bw_vl_vnni_supported = opt.avx512f;
-INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F, 32,
+bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw &&
+                                   opt.avx512vl && opt.avx512vnni;
+INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 32,
                                 avx512_f_bw_vl_vnni_supported);
 // INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32,
 //                                  avx512_f_bw_vl_vnni_supported);
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index a251cb28b..7d4ddfdd0 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2199,7 +2199,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
     if (optimization.avx512f) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
-        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F(dim))
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim))
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX512 with dim " << dim;
@@ -2307,7 +2307,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
-        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F(dim))
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim))
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX512 with dim " << dim;

From c3670a85a497123c0f448fb8f6f676b0ceac0d8b Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:41:46 +0300
Subject: [PATCH 13/52] again

---
 tests/unit/test_spaces.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 7d4ddfdd0..7ad503c0f 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2195,8 +2195,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
     float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim);
 
     // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512F
-    if (optimization.avx512f) {
+    #ifdef OPT_AVX512F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim))
@@ -2303,8 +2303,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
 
     // Test different optimizations based on CPU features
     #ifdef OPT_AVX512F
-    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl &&
-        optimization.avx512vnni) {
+    if (features.avx512f && features.avx512bw && features.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim))

From 11303b7e89e43f344051b513c0ab2ad52cfc591b Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:42:49 +0300
Subject: [PATCH 14/52] optimization

---
 tests/unit/test_spaces.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 7ad503c0f..8994db979 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2196,7 +2196,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
 
     // Test different optimizations based on CPU features
     #ifdef OPT_AVX512F_BW_VL_VNNI
-    if (features.avx512f && features.avx512bw && features.avx512vnni) {
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim))
@@ -2303,7 +2303,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
 
     // Test different optimizations based on CPU features
     #ifdef OPT_AVX512F
-    if (features.avx512f && features.avx512bw && features.avx512vnni) {
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim))

From 7474c059513c8756212e23313f51069b206c2342 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:43:53 +0300
Subject: [PATCH 15/52] more BW

---
 tests/unit/test_spaces.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 8994db979..aaa1c7ef8 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2302,7 +2302,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
     #endif
 
     // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512F
+    #ifdef OPT_AVX512F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);

From 2cfd9b699547b83869b670f73b6f7e005f8fd986 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:55:11 +0300
Subject: [PATCH 16/52] fix avx

---
 src/VecSim/spaces/IP/IP_AVX_SQ8.h | 66 +++++++++----------------------
 1 file changed, 19 insertions(+), 47 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
index c917f7787..2fbd4401f 100644
--- a/src/VecSim/spaces/IP/IP_AVX_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
@@ -9,7 +9,7 @@
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
 
-static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 &sum256,
+static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
                                       const __m256 &min_val_vec, const __m256 &delta_vec) {
     // Load 8 float elements from pVect1
     __m256 v1 = _mm256_loadu_ps(pVect1);
@@ -34,58 +34,30 @@ static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256
 
 template <unsigned char residual> // 0..15
 float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    float *pVect1 = (float *)pVect1v;
-    uint8_t *quantized = (uint8_t *)pVect2v;
-
-    // Get dequantization parameters from the end of quantized vector
-    float min = *(float *)(quantized + dimension);
-    float delta = *(float *)(quantized + dimension + sizeof(float));
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    // pVect2 is a quantized uint8_t vector
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float *pEnd1 = pVect1 + dimension;
     
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
     // Create broadcast vectors for SIMD operations
-    __m256 min_val_vec = _mm256_set1_ps(min);
+    __m256 min_val_vec = _mm256_set1_ps(min_val);
     __m256 delta_vec = _mm256_set1_ps(delta);
 
-    const float *pEnd1 = pVect1 + dimension;
-
     __m256 sum256 = _mm256_setzero_ps();
 
-    // Deal with 1-7 floats with mask loading, if needed
+    // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
+    // 16-float block, so mask loading is guaranteed to be safe.
     if constexpr (residual % 8) {
         __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
-        
-        // Load masked float elements
         __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
         pVect1 += residual % 8;
         
-        // Load masked uint8 elements
-        __m128i v2_128;
-        if constexpr (residual % 8 <= 4) {
-            // Load 4 or fewer bytes directly using unaligned loads and shifts
-            uint32_t temp = 0;
-            // Direct byte-by-byte loading to avoid memcpy
-            switch (residual % 8) {
-                case 4: temp |= (uint32_t)quantized[3] << 24;
-                case 3: temp |= (uint32_t)quantized[2] << 16;
-                case 2: temp |= (uint32_t)quantized[1] << 8;
-                case 1: temp |= quantized[0];
-            }
-            v2_128 = _mm_cvtsi32_si128(temp);
-        } else {
-            // Load 5-7 bytes directly using unaligned loads and shifts
-            uint64_t temp = 0;
-            // Direct byte-by-byte loading to avoid memcpy
-            switch (residual % 8) {
-                case 7: temp |= (uint64_t)quantized[6] << 48;
-                case 6: temp |= (uint64_t)quantized[5] << 40;
-                case 5: temp |= (uint64_t)quantized[4] << 32;
-                case 4: temp |= (uint64_t)quantized[3] << 24;
-                case 3: temp |= (uint64_t)quantized[2] << 16;
-                case 2: temp |= (uint64_t)quantized[1] << 8;
-                case 1: temp |= quantized[0];
-            }
-            v2_128 = _mm_cvtsi64_si128(temp);
-        }
-        quantized += residual % 8;
+        // Load quantized values and dequantize
+        __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2);
+        pVect2 += residual % 8;
         
         // Zero-extend uint8 to int32
         __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
@@ -93,7 +65,7 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen
         // Convert int32 to float
         __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
         
-        // Dequantize: (val * delta) + min
+        // Dequantize: (val * delta) + min_val
         __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
         
         // Compute dot product with masking
@@ -102,14 +74,14 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen
 
     // If the reminder is >=8, have another step of 8 floats
     if constexpr (residual >= 8) {
-        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
     }
 
     // We dealt with the residual part. We are left with some multiple of 16 floats.
     // In each iteration we calculate 16 floats = 512 bits.
     do {
-        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
-        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
     } while (pVect1 < pEnd1);
 
     return my_mm256_reduce_add_ps(sum256);
@@ -117,7 +89,7 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen
 
 template <unsigned char residual> // 0..15
 float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    return SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
+    return 1.0f - SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
 }
 
 template <unsigned char residual> // 0..15

From 3cdf05ee4658c8990d44e1a7f585848b0a4b461b Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:56:25 +0300
Subject: [PATCH 17/52] add avx cosine test

---
 tests/unit/test_spaces.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index aaa1c7ef8..5cf9655c0 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2315,6 +2315,18 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         optimization.avx512f = 0;
     }
     #endif
+    #ifdef OPT_AVX
+    if (optimization.avx) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX with dim " << dim;
+        // We don't align SQ8 vectors with cosine distance
+        // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim;
+        optimization.avx = 0;
+    }
 
     #ifdef OPT_SSE
     if (optimization.sse) {

From fc8bc7ded3d05e93951d05f24e78ab53a5e2a8d6 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 13:58:30 +0300
Subject: [PATCH 18/52] avx

---
 src/VecSim/spaces/IP_space.cpp     | 14 +++++++-------
 src/VecSim/spaces/functions/AVX.h  |  2 ++
 src/VecSim/spaces/functions/AVX2.h |  1 -
 tests/unit/test_spaces.cpp         |  1 +
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 3ba81a92e..f3d3dc07e 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -131,13 +131,13 @@ dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment,
             return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
         }
     #endif
-    // #ifdef OPT_AVX
-    //     if (features.avx) {
-    //         if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
-    //             *alignment = 8 * sizeof(float); // handles 8 floats
-    //         return Choose_SQ8_Cosine_implementation_AVX(dim);
-    //     }
-    // #endif
+    #ifdef OPT_AVX
+        if (features.avx) {
+            if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
+                *alignment = 8 * sizeof(float); // handles 8 floats
+            return Choose_SQ8_Cosine_implementation_AVX(dim);
+        }
+    #endif
     #ifdef OPT_SSE
         if (features.sse) {
             if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
diff --git a/src/VecSim/spaces/functions/AVX.h b/src/VecSim/spaces/functions/AVX.h
index 7f2c38b1f..ccdede166 100644
--- a/src/VecSim/spaces/functions/AVX.h
+++ b/src/VecSim/spaces/functions/AVX.h
@@ -13,6 +13,8 @@
 namespace spaces {
 
 dist_func_t<float> Choose_SQ8_IP_implementation_AVX(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX(size_t dim);
+
 dist_func_t<float> Choose_FP32_IP_implementation_AVX(size_t dim);
 dist_func_t<double> Choose_FP64_IP_implementation_AVX(size_t dim);
 
diff --git a/src/VecSim/spaces/functions/AVX2.h b/src/VecSim/spaces/functions/AVX2.h
index 06b0269de..8ad04a8a5 100644
--- a/src/VecSim/spaces/functions/AVX2.h
+++ b/src/VecSim/spaces/functions/AVX2.h
@@ -14,6 +14,5 @@ namespace spaces {
 
 dist_func_t<float> Choose_BF16_IP_implementation_AVX2(size_t dim);
 dist_func_t<float> Choose_BF16_L2_implementation_AVX2(size_t dim);
-dist_func_t<float> Choose_SQ8_IP_implementation_AVX2(size_t dim);
 
 } // namespace spaces
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 5cf9655c0..307bf6c5c 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2327,6 +2327,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim;
         optimization.avx = 0;
     }
+    #endif
 
     #ifdef OPT_SSE
     if (optimization.sse) {

From 513839b639bd77c70a46f7d925964d315e250d98 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 14:00:19 +0300
Subject: [PATCH 19/52] add impl

---
 src/VecSim/spaces/functions/AVX.cpp  | 6 ++++++
 src/VecSim/spaces/functions/AVX2.cpp | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/VecSim/spaces/functions/AVX.cpp b/src/VecSim/spaces/functions/AVX.cpp
index d0e5b6fbe..33ef7b4dc 100644
--- a/src/VecSim/spaces/functions/AVX.cpp
+++ b/src/VecSim/spaces/functions/AVX.cpp
@@ -25,6 +25,12 @@ dist_func_t<float> Choose_SQ8_IP_implementation_AVX(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX);
+    return ret_dist_func;
+}
+
 dist_func_t<float> Choose_FP32_IP_implementation_AVX(size_t dim) {
     dist_func_t<float> ret_dist_func;
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX);
diff --git a/src/VecSim/spaces/functions/AVX2.cpp b/src/VecSim/spaces/functions/AVX2.cpp
index 5e0bde6c8..b7df68ce9 100644
--- a/src/VecSim/spaces/functions/AVX2.cpp
+++ b/src/VecSim/spaces/functions/AVX2.cpp
@@ -28,12 +28,6 @@ dist_func_t<float> Choose_BF16_L2_implementation_AVX2(size_t dim) {
     return ret_dist_func;
 }
 
-dist_func_t<float> Choose_SQ8_IP_implementation_AVX2(size_t dim) {
-    dist_func_t<float> ret_dist_func;
-    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2);
-    return ret_dist_func;
-}
-
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces

From f676c1bbc8593e10ae697dc3ebe4adf41ef3eca5 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 14:14:12 +0300
Subject: [PATCH 20/52] add l2

---
 .../spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h     | 98 +++++++++++++++++++
 src/VecSim/spaces/L2_space.cpp                | 26 ++---
 src/VecSim/spaces/L2_space.h                  |  4 +-
 .../spaces/functions/AVX512F_BW_VL_VNNI.cpp   |  6 ++
 .../spaces/functions/AVX512F_BW_VL_VNNI.h     |  1 +
 tests/unit/test_spaces.cpp                    | 86 ++++++++--------
 6 files changed, 163 insertions(+), 58 deletions(-)
 create mode 100644 src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h

diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
new file mode 100644
index 000000000..448388932
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+*/
+#include "VecSim/spaces/space_includes.h"
+
+// Adds the squared differences of one 16-element chunk to sum; advances both pointers by 16
+static inline void
+SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum,
+              const __m512 &min_val_vec, const __m512 &delta_vec, const __m512 &inv_norm_vec) {
+    // Load 16 floats from pVect1 (unaligned load)
+    __m512 v1 = _mm512_loadu_ps(pVect1);
+
+    // Load 16 quantized bytes from pVect2 and zero-extend each to a 32-bit integer lane
+    __m128i v2_128 = _mm_loadu_si128((__m128i*)pVect2);
+    __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
+
+    // Convert the 32-bit integer lanes to float
+    __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
+
+    // Dequantize: (val * delta + min_val) * inv_norm. NOTE(review): confirm inv_norm belongs in L2
+    __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
+    dequantized = _mm512_mul_ps(dequantized, inv_norm_vec);
+
+    // Element-wise difference between the pVect1 floats and the dequantized values
+    __m512 diff = _mm512_sub_ps(v1, dequantized);
+
+    // Fused multiply-add: sum += diff * diff
+    sum = _mm512_fmadd_ps(diff, diff, sum);
+
+    // Advance both pointers (passed by reference) past the 16 processed elements
+    pVect1 += 16;
+    pVect2 += 16;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
+                                          size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float *pEnd1 = pVect1 + dimension;
+
+    // min_val, delta and inv_norm are floats stored right after the dimension quantized bytes
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Broadcast each parameter to all 16 lanes
+    __m512 min_val_vec = _mm512_set1_ps(min_val);
+    __m512 delta_vec = _mm512_set1_ps(delta);
+    __m512 inv_norm_vec = _mm512_set1_ps(inv_norm);
+
+    // Accumulator for the squared differences, one partial sum per lane
+    __m512 sum = _mm512_setzero_ps();
+    
+    // Handle the first residual (1 to 15) elements with masked loads
+    if constexpr (residual > 0) {
+        // Low residual bits set: selects lanes 0 .. residual - 1
+        __mmask16 mask = (1U << residual) - 1;
+
+        // Masked float load from pVect1; unselected lanes are zeroed
+        __m512 v1 = _mm512_maskz_loadu_ps(mask, pVect1);
+
+        // Masked byte load from pVect2 (an AVX512BW + AVX512VL intrinsic); zero-extended below
+        __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i*>(pVect2));
+        __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
+        __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
+
+        // Dequantize: (val * delta + min_val) * inv_norm
+        __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
+        dequantized = _mm512_mul_ps(dequantized, inv_norm_vec);
+
+        // Element-wise difference; unselected lanes hold -(min_val * inv_norm), not 0
+        __m512 diff = _mm512_sub_ps(v1, dequantized);
+
+        // Masked add so that only the selected lanes' squares are accumulated into sum
+        __m512 squared = _mm512_mul_ps(diff, diff);
+        sum = _mm512_mask_add_ps(sum, mask, sum, squared);
+
+        // Advance both pointers past the residual elements
+        pVect1 += residual;
+        pVect2 += residual;
+    }
+
+    // Process the remaining full chunks of 16 elements; do-while assumes at least one chunk
+    do  {
+        SQ8_L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec, inv_norm_vec);
+    }while (pVect1 < pEnd1);
+
+    // Horizontal sum of the 16 accumulator lanes
+    float result = _mm512_reduce_add_ps(sum);
+    
+    return result;
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 488e2fe5a..ff9976fe0 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -60,19 +60,19 @@ namespace spaces {
     // #endif
     // #endif
     
-    // #ifdef CPU_FEATURES_ARCH_X86_64
-    //     // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+    #ifdef CPU_FEATURES_ARCH_X86_64
+        // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
     
-    //     if (dim < 16) {
-    //         return ret_dist_func;
-    //     }
-    // #ifdef OPT_AVX512F
-    //     if (features.avx512f) {
-    //         if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
-    //             *alignment = 16 * sizeof(float); // handles 16 floats
-    //         return Choose_SQ8_L2_implementation_AVX512F(dim);
-    //     }
-    // #endif
+        if (dim < 16) {
+            return ret_dist_func;
+        }
+    #ifdef OPT_AVX512F_BW_VL_VNNI
+        if (features.avx512f && features.avx512bw && features.avx512vnni) {
+            if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
+                *alignment = 16 * sizeof(float); // handles 16 floats
+            return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim);
+        }
+    #endif
     // #ifdef OPT_AVX
     //     if (features.avx) {
     //         if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
@@ -87,7 +87,7 @@ namespace spaces {
     //         return Choose_SQ8_L2_implementation_SSE(dim);
     //     }
     // #endif
-    // #endif // __x86_64__
+    #endif // __x86_64__
         return ret_dist_func;
     }
 
diff --git a/src/VecSim/spaces/L2_space.h b/src/VecSim/spaces/L2_space.h
index 1bdd52473..a58fcd7e4 100644
--- a/src/VecSim/spaces/L2_space.h
+++ b/src/VecSim/spaces/L2_space.h
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #pragma once
 #include "VecSim/spaces/spaces.h"
 
@@ -22,4 +22,6 @@ dist_func_t<float> L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nu
                                        const void *arch_opt = nullptr);
 dist_func_t<float> L2_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                         const void *arch_opt = nullptr);
+dist_func_t<float> L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                      const void *arch_opt = nullptr);
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
index 76809f6b5..889725204 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
@@ -15,6 +15,7 @@
 #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h"
 
 #include "VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h"
+#include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h"
 
 namespace spaces {
 
@@ -67,6 +68,11 @@ dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t di
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI);
     return ret_dist_func;
 }
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI);
+    return ret_dist_func;
+}
 
 #include "implementation_chooser_cleanup.h"
 
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
index e2d587ef0..77eff5d57 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
@@ -24,5 +24,6 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t
 
 dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 
 } // namespace spaces
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 307bf6c5c..664d78f25 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2106,56 +2106,54 @@ std::vector<uint8_t> CreateSQ8CompressedVector(const float *original, size_t dim
 
 class SQ8SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
 
-// TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
-//     auto optimization = getCpuOptimizationFeatures();
-//     size_t dim = GetParam();
+TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
 
-//     // Create original vectors
-//     std::vector<float> v1_orig(dim);
-//     std::vector<float> v2_orig(dim);
-//     for (size_t i = 0; i < dim; i++) {
-//         v1_orig[i] = float(i + 1.5);
-//         v2_orig[i] = float(i * 0.75 + 1.0);
-//     }
+    // Create original vectors
+    std::vector<float> v1_orig(dim);
+    std::vector<float> v2_orig(dim);
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i * 0.75 + 1.0);
+    }
 
-//     // Create SQ8 compressed version of v2
-//     std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim, false);
+    // Create SQ8 compressed version of v2
+    std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
 
-//     auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
-//         size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8;
-//         return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0;
-//     };
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0;
+    };
 
-//     dist_func_t<float> arch_opt_func;
-//     float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim);
-
-//     // Test different optimizations based on CPU features
-//     #ifdef OPT_AVX512_F_BW_VL_VNNI
-//     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl &&
-//     optimization.avx512vnni) {
-//         unsigned char alignment = 0;
-//         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
-//         ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim))
-//             << "Unexpected distance function chosen for dim " << dim;
-//         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
-//             << "AVX512 with dim " << dim;
-//         ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
-//         // Unset optimizations flag, so we'll choose the next optimization.
-//         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
-//         optimization.avx512vnni = 0;
-//     }
-//     #endif
+    dist_func_t<float> arch_opt_func;
+    float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim);
 
-//     // Add other optimizations as needed (SVE2, SVE, NEON, etc.)
+    // Test different optimizations based on CPU features
+    #ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.avx512f = 0;
+    }
+    #endif
 
-//     // Test default implementation
-//     unsigned char alignment = 0;
-//     arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
-//     ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim;
-//     ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
-//         << "No optimization with dim " << dim;
-//     ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
-// }
+    // Add other optimizations as needed (SVE2, SVE, NEON, etc.)
+
+    // Test default implementation
+    unsigned char alignment = 0;
+    arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
 
 TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
     auto optimization = getCpuOptimizationFeatures();

From 9a899ccf6ce85a8b966c9da12fd5ae8591a5ec70 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 14:23:44 +0300
Subject: [PATCH 21/52] Rename OPT_AVX512F_BW_VL_VNNI guards to OPT_AVX512_F_BW_VL_VNNI

---
 src/VecSim/spaces/IP_space.cpp | 4 ++--
 src/VecSim/spaces/L2_space.cpp | 2 +-
 tests/unit/test_spaces.cpp     | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index f3d3dc07e..52aa5760f 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -66,7 +66,7 @@ namespace spaces {
         if (dim < 16) {
             return ret_dist_func;
         }
-    #ifdef OPT_AVX512F_BW_VL_VNNI
+    #ifdef OPT_AVX512_F_BW_VL_VNNI
         if (features.avx512f && features.avx512bw && features.avx512vnni) {
             if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
                 *alignment = 16 * sizeof(float); // handles 16 floats
@@ -124,7 +124,7 @@ dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment,
         if (dim < 16) {
             return ret_dist_func;
         }
-    #ifdef OPT_AVX512F_BW_VL_VNNI
+    #ifdef OPT_AVX512_F_BW_VL_VNNI
         if (features.avx512f && features.avx512bw && features.avx512vnni) {
             if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
                 *alignment = 16 * sizeof(float); // handles 16 floats
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index ff9976fe0..4febd8057 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -66,7 +66,7 @@ namespace spaces {
         if (dim < 16) {
             return ret_dist_func;
         }
-    #ifdef OPT_AVX512F_BW_VL_VNNI
+    #ifdef OPT_AVX512_F_BW_VL_VNNI
         if (features.avx512f && features.avx512bw && features.avx512vnni) {
             if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
                 *alignment = 16 * sizeof(float); // handles 16 floats
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 664d78f25..0eb70aa16 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2193,7 +2193,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
     float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim);
 
     // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512F_BW_VL_VNNI
+    #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2300,7 +2300,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
     #endif
 
     // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512F_BW_VL_VNNI
+    #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);

From 4fa53277add948335d7e33fac313c162f1b3ee01 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 14:24:37 +0300
Subject: [PATCH 22/52] Disable AVX512 alignment assertion in SQ8L2SqrTest

---
 tests/unit/test_spaces.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 0eb70aa16..8e54f8a2a 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2138,7 +2138,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX512 with dim " << dim;
-        ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.avx512f = 0;
     }

From 1379d6d260f789fae3009ea3c62d4debd08b8c98 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 15:59:52 +0300
Subject: [PATCH 23/52] Add SQ8 L2 AVX kernel and drop inv_norm from SQ8 L2

---
 src/VecSim/spaces/IP/IP_AVX_SQ8.h             |  2 +
 src/VecSim/spaces/L2/L2.cpp                   | 10 +-
 .../spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h     |  8 +-
 src/VecSim/spaces/L2/L2_AVX_SQ8.h             | 99 ++++++++++++++++---
 src/VecSim/spaces/L2_space.cpp                | 14 +--
 src/VecSim/spaces/functions/AVX.cpp           |  7 ++
 src/VecSim/spaces/functions/AVX.h             |  1 +
 tests/unit/test_spaces.cpp                    | 18 +++-
 8 files changed, 125 insertions(+), 34 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
index 2fbd4401f..d28a13a4f 100644
--- a/src/VecSim/spaces/IP/IP_AVX_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
@@ -67,6 +67,8 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen
         
         // Dequantize: (val * delta) + min_val
         __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+        v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
+ 
         
         // Compute dot product with masking
         sum256 = _mm256_mul_ps(v1, v2_dequant);
diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp
index 08ea8674c..85e78edb2 100644
--- a/src/VecSim/spaces/L2/L2.cpp
+++ b/src/VecSim/spaces/L2/L2.cpp
@@ -10,6 +10,7 @@
 #include "VecSim/types/bfloat16.h"
 #include "VecSim/types/float16.h"
 #include <cstring>
+#include <iostream>
 
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
@@ -22,14 +23,17 @@ float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
     // The last two values are used to dequantize the vector.
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
 
     float res = 0;
     for (size_t i = 0; i < dimension; i++) {
-        auto dequantized_normalized_V2 = (pVect2[i] * delta + min_val) * inv_norm;
-        float t = pVect1[i] - dequantized_normalized_V2;
+        auto dequantized_V2 = (pVect2[i] * delta + min_val);
+        std::cout << dequantized_V2 << " ";
+        float t = pVect1[i] - dequantized_V2;
         res += t * t;
     }
+    // The last value is used to normalize the vector.
+    // The normalization is done by multiplying the result by the inverse of the norm.
+    std::cout << std::endl;
     return res;
 }
 
diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
index 448388932..c3d06d1a3 100644
--- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
@@ -11,7 +11,7 @@
 // Helper function to perform L2 squared distance calculation for a chunk of 16 elements
 static inline void
 SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum,
-              const __m512 &min_val_vec, const __m512 &delta_vec, const __m512 &inv_norm_vec) {
+              const __m512 &min_val_vec, const __m512 &delta_vec) {
     // Load 16 float elements from pVect1
     __m512 v1 = _mm512_loadu_ps(pVect1);
 
@@ -24,7 +24,6 @@ SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum,
 
     // Dequantize: (val * delta + min_val) * inv_norm
     __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
-    dequantized = _mm512_mul_ps(dequantized, inv_norm_vec);
 
     // Compute difference
     __m512 diff = _mm512_sub_ps(v1, dequantized);
@@ -47,12 +46,10 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2
     // Get dequantization parameters from the end of pVect2
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
 
     // Create broadcast vectors for SIMD operations
     __m512 min_val_vec = _mm512_set1_ps(min_val);
     __m512 delta_vec = _mm512_set1_ps(delta);
-    __m512 inv_norm_vec = _mm512_set1_ps(inv_norm);
 
     // Initialize sum accumulator
     __m512 sum = _mm512_setzero_ps();
@@ -72,7 +69,6 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2
 
         // Dequantize: (val * delta + min_val) * inv_norm
         __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
-        dequantized = _mm512_mul_ps(dequantized, inv_norm_vec);
 
         // Compute difference
         __m512 diff = _mm512_sub_ps(v1, dequantized);
@@ -88,7 +84,7 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2
 
     // Process remaining full chunks of 16 elements
     do  {
-        SQ8_L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec, inv_norm_vec);
+        SQ8_L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
     }while (pVect1 < pEnd1);
 
     // Horizontal sum
diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
index e4cf82c45..715b147f3 100644
--- a/src/VecSim/spaces/L2/L2_AVX_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
@@ -8,21 +8,48 @@
 */
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
+#include <iostream>
 
-static inline void L2SqrStep(float *&pVect1, float *&pVect2, __m256 &sum) {
+static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m256 &sum, 
+                            const __m256 &min_val_vec, const __m256 &delta_vec) {
+    // Load 8 float elements from pVect1
     __m256 v1 = _mm256_loadu_ps(pVect1);
+    
+    // Load 8 uint8 elements from pVect2
+    __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2);
+    
+    // Zero-extend uint8 to int32
+    __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+    
+    // Convert int32 to float
+    __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+    
+    // Dequantize: (val * delta) + min_val
+    __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+    
+    // Compute difference
+    __m256 diff = _mm256_sub_ps(v1, v2_dequant);
+    
+    // Square difference and add to sum
+    sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
+    
+    // Advance pointers
     pVect1 += 8;
-    __m256 v2 = _mm256_loadu_ps(pVect2);
     pVect2 += 8;
-    __m256 diff = _mm256_sub_ps(v1, v2);
-    // sum = _mm256_fmadd_ps(diff, diff, sum);
-    sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
 }
 
 template <unsigned char residual> // 0..15
-float FP32_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) {
+float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) {
     float *pVect1 = (float *)pVect1v;
-    float *pVect2 = (float *)pVect2v;
+    uint8_t *pVect2 = (uint8_t *)pVect2v;
+    float *pVect1_debug = (float *)pVect1v;
+    uint8_t *pVect2_debug = (uint8_t *)pVect2v; 
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    // Create broadcast vectors for SIMD operations
+    __m256 min_val_vec = _mm256_set1_ps(min_val);
+    __m256 delta_vec = _mm256_set1_ps(delta);
 
     const float *pEnd1 = pVect1 + dimension;
 
@@ -30,25 +57,65 @@ float FP32_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dime
 
     // Deal with 1-7 floats with mask loading, if needed
     if constexpr (residual % 8) {
-        __mmask8 constexpr mask8 = (1 << (residual % 8)) - 1;
-        __m256 v1 = my_mm256_maskz_loadu_ps<mask8>(pVect1);
+        __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
+        __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
         pVect1 += residual % 8;
-        __m256 v2 = my_mm256_maskz_loadu_ps<mask8>(pVect2);
+        
+        uint8_t temp_buf[8] = {0};
+        // Manually copy elements
+        for (size_t i = 0; i < residual % 8; i++) {
+            temp_buf[i] = pVect2[i];
+        }
+        // Load from buffer
+        __m128i v2_128 = _mm_loadl_epi64((__m128i*)temp_buf);
         pVect2 += residual % 8;
-        __m256 diff = _mm256_sub_ps(v1, v2);
+        
+        // Zero-extend uint8 to int32
+        __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+        
+        // Convert int32 to float
+        __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+        
+        // Dequantize: (val * delta) + min_val
+        __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+        // print debug information
+        // std::cout << "v2_dequant before: ";
+        // for (size_t i = 0; i <  8; i++) {
+        //     std::cout <<  v2_dequant[i] << " ";
+        // }
+        // std::cout << std::endl;
+        
+        v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
+        // std::cout << "v2_dequant after: ";
+        // for (size_t i = 0; i <  8; i++) {
+        //     std::cout <<  v2_dequant[i] << " ";
+        // }
+        // std::cout << std::endl;
+
+        __m256 diff = _mm256_sub_ps(v1, v2_dequant);
+
+
         sum = _mm256_mul_ps(diff, diff);
+        // print sum
     }
 
-    // If the reminder is >=8, have another step of 8 floats
+    // If the reminder is >= 8, have another step of 8 floats
     if constexpr (residual >= 8) {
-        L2SqrStep(pVect1, pVect2, sum);
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
     }
-
+    float naive_sum = 0;
+    for (size_t i = 0; i < residual; i++) {
+        auto dequantized_V2 = (pVect2_debug[i] * delta + min_val);
+        float t = pVect1_debug[i] - dequantized_V2;
+        naive_sum += t * t;
+    }
+    std::cout <<"residual: " << (int)residual << " " << naive_sum << " " << my_mm256_reduce_add_ps(sum) << std::endl;
+    
     // We dealt with the residual part. We are left with some multiple of 16 floats.
     // In each iteration we calculate 16 floats = 512 bits.
     do {
-        L2SqrStep(pVect1, pVect2, sum);
-        L2SqrStep(pVect1, pVect2, sum);
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
     } while (pVect1 < pEnd1);
 
     return my_mm256_reduce_add_ps(sum);
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 4febd8057..d7136c82f 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -73,13 +73,13 @@ namespace spaces {
             return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim);
         }
     #endif
-    // #ifdef OPT_AVX
-    //     if (features.avx) {
-    //         if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
-    //             *alignment = 8 * sizeof(float); // handles 8 floats
-    //         return Choose_SQ8_L2_implementation_AVX(dim);
-    //     }
-    // #endif
+    #ifdef OPT_AVX
+        if (features.avx) {
+            if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
+                *alignment = 8 * sizeof(float); // handles 8 floats
+            return Choose_SQ8_L2_implementation_AVX(dim);
+        }
+    #endif
     // #ifdef OPT_SSE
     //     if (features.sse) {
     //         if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
diff --git a/src/VecSim/spaces/functions/AVX.cpp b/src/VecSim/spaces/functions/AVX.cpp
index 33ef7b4dc..75ee8bf17 100644
--- a/src/VecSim/spaces/functions/AVX.cpp
+++ b/src/VecSim/spaces/functions/AVX.cpp
@@ -12,6 +12,7 @@
 #include "VecSim/spaces/L2/L2_AVX_FP64.h"
 
 #include "VecSim/spaces/IP/IP_AVX_SQ8.h"
+#include "VecSim/spaces/L2/L2_AVX_SQ8.h"
 #include "VecSim/spaces/IP/IP_AVX_FP32.h"
 #include "VecSim/spaces/IP/IP_AVX_FP64.h"
 
@@ -31,6 +32,12 @@ dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX);
+    return ret_dist_func;
+}
+
 dist_func_t<float> Choose_FP32_IP_implementation_AVX(size_t dim) {
     dist_func_t<float> ret_dist_func;
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX);
diff --git a/src/VecSim/spaces/functions/AVX.h b/src/VecSim/spaces/functions/AVX.h
index ccdede166..416c8d5f8 100644
--- a/src/VecSim/spaces/functions/AVX.h
+++ b/src/VecSim/spaces/functions/AVX.h
@@ -14,6 +14,7 @@ namespace spaces {
 
 dist_func_t<float> Choose_SQ8_IP_implementation_AVX(size_t dim);
 dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX(size_t dim);
 
 dist_func_t<float> Choose_FP32_IP_implementation_AVX(size_t dim);
 dist_func_t<double> Choose_FP64_IP_implementation_AVX(size_t dim);
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 8e54f8a2a..2477a5fc9 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -435,6 +435,7 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) {
     // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
     size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
     spaces::GetNormalizeFunc<float>()(v1_orig, dim);
+    spaces::GetNormalizeFunc<float>()(v2_orig, dim);
     // Find min and max for quantization
     float min_val = v2_orig[0];
     float max_val = v2_orig[0];
@@ -2117,7 +2118,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         v1_orig[i] = float(i + 1.5);
         v2_orig[i] = float(i * 0.75 + 1.0);
     }
-
+    
     // Create SQ8 compressed version of v2
     std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
 
@@ -2128,7 +2129,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
 
     dist_func_t<float> arch_opt_func;
     float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim);
-
+    std::cout << "baseline: " << baseline << std::endl;
     // Test different optimizations based on CPU features
     #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
@@ -2143,6 +2144,19 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         optimization.avx512f = 0;
     }
     #endif
+    #ifdef OPT_AVX
+    if (optimization.avx) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
+        // Unset avx flag as well, so we'll choose the next optimization (SSE).
+        optimization.avx = 0;
+    }
+    #endif
 
     // Add other optimizations as needed (SVE2, SVE, NEON, etc.)
 

From f7fdb2b25cbe5b9ee4c6d0064922541f23183afe Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 16:27:26 +0300
Subject: [PATCH 24/52] Add SQ8 L2 SSE implementation

---
 src/VecSim/spaces/IP/IP_AVX2_SQ8.h            |  30 ++---
 src/VecSim/spaces/L2/L2_SSE_SQ8.h             | 126 ++++++++++++++++++
 src/VecSim/spaces/L2_space.cpp                |  14 +-
 src/VecSim/spaces/functions/SSE.cpp           |   7 +
 src/VecSim/spaces/functions/SSE.h             |   2 +
 .../spaces_benchmarks/bm_spaces_sq8.cpp       |  70 +++++-----
 tests/unit/test_spaces.cpp                    |  13 ++
 7 files changed, 198 insertions(+), 64 deletions(-)
 create mode 100644 src/VecSim/spaces/L2/L2_SSE_SQ8.h

diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
index 6d0dd4af7..df2f134f1 100644
--- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
@@ -36,14 +36,14 @@ static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256
 template <unsigned char residual> // 0..15
 float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
     float *pVect1 = (float *)pVect1v;
-    uint8_t *quantized = (uint8_t *)pVect2v;
+    uint8_t *pVect2 = (uint8_t *)pVect2v;
 
     // Get dequantization parameters from the end of quantized vector
-    float min = *(float *)(quantized + dimension);
-    float delta = *(float *)(quantized + dimension + sizeof(float));
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
     
     // Create broadcast vectors for SIMD operations
-    __m256 min_val_vec = _mm256_set1_ps(min);
+    __m256 min_val_vec = _mm256_set1_ps(min_val);
     __m256 delta_vec = _mm256_set1_ps(delta);
 
     const float *pEnd1 = pVect1 + dimension;
@@ -60,19 +60,8 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size
         pVect1 += residual % 8;
         
         // Load masked uint8 elements
-        __m128i v2_128;
-        if constexpr (residual % 8 <= 4) {
-            // Load 4 or fewer bytes
-            uint32_t temp = 0;
-            memcpy(&temp, quantized, residual % 8);
-            v2_128 = _mm_cvtsi32_si128(temp);
-        } else {
-            // Load 5-7 bytes
-            uint64_t temp = 0;
-            memcpy(&temp, quantized, residual % 8);
-            v2_128 = _mm_cvtsi64_si128(temp);
-        }
-        quantized += residual % 8;
+        __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2);
+        pVect2 += residual % 8;
         
         // Zero-extend uint8 to int32 (AVX2 instruction)
         __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
@@ -82,6 +71,7 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size
         
         // Dequantize: (val * delta) + min (using FMA)
         __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
+        v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
         
         // Compute dot product with masking
         sum256 = _mm256_mul_ps(v1, v2_dequant);
@@ -89,14 +79,14 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size
 
     // If the reminder is >=8, have another step of 8 floats
     if constexpr (residual >= 8) {
-        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
     }
 
     // We dealt with the residual part. We are left with some multiple of 16 floats.
     // In each iteration we calculate 16 floats = 512 bits.
     while (pVect1 < pEnd1) {
-        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
-        InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
     }
 
     // Horizontal sum - AVX2 can use more efficient reduction
diff --git a/src/VecSim/spaces/L2/L2_SSE_SQ8.h b/src/VecSim/spaces/L2/L2_SSE_SQ8.h
new file mode 100644
index 000000000..89cd7db1a
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SSE_SQ8.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+*/
+#include "VecSim/spaces/space_includes.h"
+#include <string.h>
+
+// Accumulates into `sum` the squared differences between 4 floats of pVect1 and 4 dequantized
+// uint8 codes of pVect2 (code * delta + min_val), then advances both pointers by 4 elements.
+static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m128 &sum,
+                            const __m128 &min_val_vec, const __m128 &delta_vec) {
+    __m128 v1 = _mm_loadu_ps(pVect1);
+    pVect1 += 4;
+
+    // Copy the 4 code bytes through memcpy: pVect2 has no alignment guarantee for a 32-bit load.
+    int32_t packed_codes;
+    memcpy(&packed_codes, pVect2, sizeof(packed_codes));
+    pVect2 += 4;
+
+    // Zero-extend uint8 -> int32 with SSE2 unpacks; _mm_cvtepu8_epi32 would require SSE4.1.
+    const __m128i zero = _mm_setzero_si128();
+    __m128i v2_i = _mm_unpacklo_epi8(_mm_cvtsi32_si128(packed_codes), zero);
+    v2_i = _mm_unpacklo_epi16(v2_i, zero);
+
+    // Dequantize (code * delta + min_val), subtract from v1, and accumulate the squares.
+    __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2_i), delta_vec), min_val_vec);
+    __m128 diff = _mm_sub_ps(v1, v2_dequant);
+    sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
+}
+
+// Squared L2 distance between a float32 vector (pVect1v) and an SQ8-quantized vector (pVect2v:
+// `dimension` uint8 codes followed by float min_val and float delta). `residual` is expected to
+// equal dimension % 16 (the chooser is invoked with a chunk of 16) - confirm against the macro.
+template <unsigned char residual> // 0..15
+float SQ8_L2SqrSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float *pVect1 = (float *)pVect1v;
+    uint8_t *pVect2 = (uint8_t *)pVect2v;
+
+    // min_val and delta sit right after the codes, at an offset that is generally not
+    // float-aligned, so copy them out with memcpy rather than dereferencing a cast pointer.
+    float min_val, delta;
+    memcpy(&min_val, pVect2 + dimension, sizeof(float));
+    memcpy(&delta, pVect2 + dimension + sizeof(float), sizeof(float));
+
+    // Broadcast the parameters for the SIMD steps.
+    const __m128 min_val_vec = _mm_set1_ps(min_val);
+    const __m128 delta_vec = _mm_set1_ps(delta);
+
+    const float *pEnd1 = pVect1 + dimension;
+
+    __m128 sum = _mm_setzero_ps();
+
+    // Consume the residual first so that a multiple of 16 elements is left for the main loop.
+    if constexpr (residual) {
+        // Leading 1-3 elements: build both operands by hand, leaving unused lanes at zero so
+        // that they contribute nothing to the sum.
+        if constexpr (residual % 4) {
+            __m128 v1;
+            __m128 v2_dequant = _mm_setzero_ps();
+
+            if constexpr (residual % 4 == 3) {
+                // Lane 0 takes element 0, lanes 2-3 take elements 1-2; lane 1 stays zero.
+                v1 = _mm_load_ss(pVect1);
+                v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1));
+
+                float dequant0 = pVect2[0] * delta + min_val;
+                v2_dequant = _mm_load_ss(&dequant0);
+
+                float dequant_high[2] = {
+                    pVect2[1] * delta + min_val,
+                    pVect2[2] * delta + min_val
+                };
+                v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high);
+
+            } else if constexpr (residual % 4 == 2) {
+                // Lanes 2-3 take elements 0-1; lanes 0-1 stay zero.
+                v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1);
+
+                float dequant_high[2] = {
+                    pVect2[0] * delta + min_val,
+                    pVect2[1] * delta + min_val
+                };
+                v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high);
+
+            } else if constexpr (residual % 4 == 1) {
+                // Lane 0 takes element 0; lanes 1-3 stay zero.
+                v1 = _mm_load_ss(pVect1);
+
+                float dequant0 = pVect2[0] * delta + min_val;
+                v2_dequant = _mm_load_ss(&dequant0);
+            }
+
+            pVect1 += residual % 4;
+            pVect2 += residual % 4;
+
+            // First contribution, so it initializes the accumulator.
+            __m128 diff = _mm_sub_ps(v1, v2_dequant);
+            sum = _mm_mul_ps(diff, diff);
+        }
+
+        // Then the remaining whole groups of 4 elements of the residual (up to three).
+        if constexpr (residual >= 12)
+            L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+        if constexpr (residual >= 8)
+            L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+        if constexpr (residual >= 4)
+            L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+    }
+
+    // Main loop: 16 elements per iteration (4 steps of 4 elements).
+    while (pVect1 < pEnd1) {
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+    }
+
+    // Horizontal sum; _mm_store_ps requires a 16-byte aligned destination.
+    float PORTABLE_ALIGN16 TmpRes[4];
+    _mm_store_ps(TmpRes, sum);
+    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index d7136c82f..cf142924e 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -80,13 +80,13 @@ namespace spaces {
             return Choose_SQ8_L2_implementation_AVX(dim);
         }
     #endif
-    // #ifdef OPT_SSE
-    //     if (features.sse) {
-    //         if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
-    //             *alignment = 4 * sizeof(float); // handles 4 floats
-    //         return Choose_SQ8_L2_implementation_SSE(dim);
-    //     }
-    // #endif
+    #ifdef OPT_SSE
+        if (features.sse) {
+            if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
+                *alignment = 4 * sizeof(float); // handles 4 floats
+            return Choose_SQ8_L2_implementation_SSE(dim);
+        }
+    #endif
     #endif // __x86_64__
         return ret_dist_func;
     }
diff --git a/src/VecSim/spaces/functions/SSE.cpp b/src/VecSim/spaces/functions/SSE.cpp
index dd218d957..f08395fab 100644
--- a/src/VecSim/spaces/functions/SSE.cpp
+++ b/src/VecSim/spaces/functions/SSE.cpp
@@ -10,6 +10,7 @@
 
 #include "VecSim/spaces/L2/L2_SSE_FP32.h"
 #include "VecSim/spaces/L2/L2_SSE_FP64.h"
+#include "VecSim/spaces/L2/L2_SSE_SQ8.h"
 
 #include "VecSim/spaces/IP/IP_SSE_FP32.h"
 #include "VecSim/spaces/IP/IP_SSE_FP64.h"
@@ -31,6 +32,12 @@ dist_func_t<float> Choose_SQ8_Cosine_implementation_SSE(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_L2_implementation_SSE(size_t dim) {
+    dist_func_t<float> ret_dist_func; // assigned by the chooser macro below
+    // NOTE(review): presumably picks SQ8_L2SqrSIMD16_SSE<dim % 16> - confirm against the macro.
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_SSE);
+    return ret_dist_func;
+}
+
 dist_func_t<float> Choose_FP32_IP_implementation_SSE(size_t dim) {
     dist_func_t<float> ret_dist_func;
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_SSE);
diff --git a/src/VecSim/spaces/functions/SSE.h b/src/VecSim/spaces/functions/SSE.h
index a86921a9c..d7ee3349e 100644
--- a/src/VecSim/spaces/functions/SSE.h
+++ b/src/VecSim/spaces/functions/SSE.h
@@ -14,6 +14,8 @@ namespace spaces {
 
 dist_func_t<float> Choose_SQ8_IP_implementation_SSE(size_t dim);
 dist_func_t<float> Choose_SQ8_Cosine_implementation_SSE(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_SSE(size_t dim);
+
 dist_func_t<float> Choose_FP32_IP_implementation_SSE(size_t dim);
 dist_func_t<double> Choose_FP64_IP_implementation_SSE(size_t dim);
 
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
index cbf0b7e5b..5d7a6bb7b 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
@@ -5,69 +5,65 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #include "bm_spaces.h"
 #include "utils/tests_utils.h"
 
 class BM_VecSimSpaces_SQ8 : public benchmark::Fixture {
-    protected:
-        std::mt19937 rng;
-        size_t dim;
-        float *v1;
-        uint8_t *v2;
-    
-    public:
+protected:
+    std::mt19937 rng;
+    size_t dim;
+    float *v1;
+    uint8_t *v2;
+
+public:
     BM_VecSimSpaces_SQ8() { rng.seed(47); }
-        ~BM_VecSimSpaces_SQ8() = default;
-    
-        void SetUp(const ::benchmark::State &state) {
-            dim = state.range(0);
-            v1 = new float[dim];
-            test_utils::populate_float_vec(v1, dim, 123);
-            // Allocate vector with extra space for min, delta and cosine calculations
-            v2 = new uint8_t[dim + sizeof(float) * 3];
-            test_utils::populate_float_vec_to_sq8(v2, dim, 1234);
-        }
-        void TearDown(const ::benchmark::State &state) {
-            delete v1;
-            delete v2;
-        }
-    };
+    ~BM_VecSimSpaces_SQ8() = default;
+
+    void SetUp(const ::benchmark::State &state) {
+        dim = state.range(0); // vector dimension comes from the first benchmark argument
+        v1 = new float[dim];
+        test_utils::populate_float_vec(v1, dim, 123);
+        // SQ8 buffer: dim uint8 codes plus room for 3 floats (min, delta and cosine calculations)
+        v2 = new uint8_t[dim + sizeof(float) * 3];
+        test_utils::populate_float_vec_to_sq8(v2, dim, 1234);
+    }
+    void TearDown(const ::benchmark::State &state) {
+        delete[] v1; // both buffers are allocated with new[] in SetUp, so release with delete[]
+        delete[] v2;
+    }
+};
 
 #ifdef CPU_FEATURES_ARCH_X86_64
 cpu_features::X86Features opt = cpu_features::GetX86Info().features;
 
 // AVX512_F_BW_VL_VNNI functions
 #ifdef OPT_AVX512_F_BW_VL_VNNI
-bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw &&
-                                   opt.avx512vl && opt.avx512vnni;
-INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 32,
+bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 16,
                                 avx512_f_bw_vl_vnni_supported);
-// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32,
-//                                  avx512_f_bw_vl_vnni_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 16,
+                                 avx512_f_bw_vl_vnni_supported);
 #endif // AVX512_F_BW_VL_VNNI
 
 #ifdef AVX2
 // AVX2 functions
 bool avx2_supported = opt.avx2;
-INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2, 32, avx2_supported);
-// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX2, 32,
-//                                  avx2_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported);
 #endif // AVX2
 
 // AVX functions
 #ifdef OPT_AVX
 bool avx_supported = opt.avx;
-INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX, 32, avx_supported);
-// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX, 32,
-//                                  avx_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX, 16, avx_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX, 16, avx_supported);
 #endif // AVX
 // SSE functions
 #ifdef OPT_SSE
 bool sse_supported = opt.sse;
-INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, SSE, 32, sse_supported);
-// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SSE, 32,
-//                                   sse_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SSE, 16, sse_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SSE, 16, sse_supported);
 #endif // SSE
 #endif // x86_64
 
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 2477a5fc9..d1b854073 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2157,6 +2157,19 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         optimization.avx = 0;
     }
     #endif
+    #ifdef OPT_SSE
+    if (optimization.sse) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SSE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SSE with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim;
+        // Unset sse flag as well, so we'll choose the next optimization (default).
+        optimization.sse = 0;
+    }
+    #endif
 
     // Add other optimizations as needed (SVE2, SVE, NEON, etc.)
 

From 4fa88b2a3b15526ee4327b390e1c0f9d78346529 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 16:31:59 +0300
Subject: [PATCH 25/52] Remove prints

---
 src/VecSim/spaces/IP/IP_SSE_SQ8.h | 5 +----
 src/VecSim/spaces/L2/L2_AVX_SQ8.h | 1 -
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_SSE_SQ8.h b/src/VecSim/spaces/IP/IP_SSE_SQ8.h
index deced094c..05b31da8d 100644
--- a/src/VecSim/spaces/IP/IP_SSE_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SSE_SQ8.h
@@ -114,7 +114,6 @@ float SQ8_InnerProductSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_
     return 1.0f - SQ8_InnerProductSIMD16_SSE_IMP<residual>(pVect1v, pVect2v, dimension);
 }
 
-
 template <unsigned char residual> // 0..15
 float SQ8_CosineSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dimension) {
 
@@ -125,9 +124,7 @@ float SQ8_CosineSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dime
     // Compute inner product with dequantization using the common function
     // We need to cast away const for the inner product function, but it doesn't modify the vectors
     const float res = SQ8_InnerProductSIMD16_SSE_IMP<residual>(pVect1v, pVect2v, dimension);
-    
-    std::cout << "res before normalization sse: " << res << std::endl;
-    std::cout << "inv_norm: " << inv_norm << std::endl;
+
     // For cosine, we need to account for the vector norms
     // The inv_norm parameter is stored after min_val and delta in the quantized vector
     return 1.0f - res * inv_norm;
diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
index 715b147f3..0d21d6476 100644
--- a/src/VecSim/spaces/L2/L2_AVX_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
@@ -109,7 +109,6 @@ float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimen
         float t = pVect1_debug[i] - dequantized_V2;
         naive_sum += t * t;
     }
-    std::cout <<"residual: " << (int)residual << " " << naive_sum << " " << my_mm256_reduce_add_ps(sum) << std::endl;
     
     // We dealt with the residual part. We are left with some multiple of 16 floats.
     // In each iteration we calculate 16 floats = 512 bits.

From 4476833b4673279f8dd63da8ca3c95f3717972dc Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 11 May 2025 17:17:26 +0300
Subject: [PATCH 26/52] sve2 l2

---
 src/VecSim/spaces/IP/IP.cpp          |   3 -
 src/VecSim/spaces/L2/L2_SVE_SQ8.h    | 134 +++++++++++++++++++++++++++
 src/VecSim/spaces/L2_space.cpp       |  42 ++++-----
 src/VecSim/spaces/functions/SVE.cpp  |   7 ++
 src/VecSim/spaces/functions/SVE.h    |   1 +
 src/VecSim/spaces/functions/SVE2.cpp |   7 ++
 src/VecSim/spaces/functions/SVE2.h   |   1 +
 tests/unit/test_spaces.cpp           | 134 +++++++++++----------------
 8 files changed, 223 insertions(+), 106 deletions(-)
 create mode 100644 src/VecSim/spaces/L2/L2_SVE_SQ8.h

diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp
index a1e5cb8e7..fd666341a 100644
--- a/src/VecSim/spaces/IP/IP.cpp
+++ b/src/VecSim/spaces/IP/IP.cpp
@@ -19,14 +19,11 @@ using float16 = vecsim_types::float16;
 float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension, float min_val,
                          float delta, float inv_norm) {
     float res = 0;
-    std::cout << "\nQuantized values: ";
     for (size_t i = 0; i < dimension; i++) {
         float dequantized_V2 = (pVect2v[i] * delta + min_val) * inv_norm;
         std::cout << dequantized_V2 << ", ";
         res += pVect1v[i] * dequantized_V2;
     }
-    std::cout << "\n";
-    std::cout << "res before normalization: " << res << std::endl;
     return res;
 }
 
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
new file mode 100644
index 000000000..e52fe5e21
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+*/
+#include "VecSim/spaces/space_includes.h"
+#include <arm_sve.h>
+
+// Accumulates into `sum` the squared differences between svcntw() floats of pVect1 and the
+// dequantized (code * delta + min_val) uint8 codes of pVect2, both read at `offset`, then
+// advances `offset` by svcntw(). The pointers themselves are never modified, so they are taken
+// by value; this also lets callers pass non-const pointers.
+static inline void L2SqrStep(const float *pVect1, const uint8_t *pVect2, size_t &offset,
+                            svfloat32_t &sum, const svfloat32_t &min_val_vec,
+                            const svfloat32_t &delta_vec) {
+    svbool_t pg = svptrue_b32();
+
+    // Load float elements from pVect1
+    svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
+
+    // Load one uint8 code per 32-bit lane, zero-extended to uint32. A byte-wide load followed
+    // by a reinterpret would pack 4 codes into each lane and read 4x as many bytes.
+    svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset);
+
+    // Convert uint32 to float32
+    svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32);
+
+    // Dequantize: (val * delta) + min_val
+    svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec);
+
+    // Compute difference
+    svfloat32_t diff = svsub_f32_z(pg, v1, v2_dequant);
+
+    // Square difference and add to sum
+    sum = svmla_f32_z(pg, sum, diff, diff);
+
+    // Move to the next set of elements
+    offset += svcntw();
+}
+
+// Squared L2 distance between a float32 vector (pVect1v) and an SQ8-quantized vector (pVect2v:
+// `dimension` uint8 codes followed by float min_val and float delta).
+// partial_chunk: whether dimension % svcntw() elements must be handled with a predicate.
+// additional_steps: number of full-vector steps (0-3) to run before the 4-step main loop.
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    // Read-only views of the inputs (const, so they can be handed to L2SqrStep as-is).
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    size_t offset = 0;
+
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    svbool_t pg = svptrue_b32();
+    svfloat32_t min_val_vec = svdup_f32(min_val);
+    svfloat32_t delta_vec = svdup_f32(delta);
+
+    // Get the number of 32-bit elements per vector at runtime
+    uint64_t sve_word_count = svcntw();
+
+    // Multiple accumulators to increase instruction-level parallelism
+    svfloat32_t sum0 = svdup_f32(0.0f);
+    svfloat32_t sum1 = svdup_f32(0.0f);
+    svfloat32_t sum2 = svdup_f32(0.0f);
+    svfloat32_t sum3 = svdup_f32(0.0f);
+
+    // Handle partial chunk if needed
+    if constexpr (partial_chunk) {
+        size_t remaining = dimension % sve_word_count;
+        if (remaining > 0) {
+            // Predicate covering only the first `remaining` 32-bit lanes. Both bounds are cast
+            // to uint64_t so that a single svwhilelt_b32 overload matches.
+            svbool_t pg_partial =
+                svwhilelt_b32(static_cast<uint64_t>(0), static_cast<uint64_t>(remaining));
+
+            // Load float elements from pVect1 with predicate
+            svfloat32_t v1 = svld1_f32(pg_partial, pVect1);
+
+            // Load one uint8 code per active 32-bit lane, zero-extended to uint32
+            svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2);
+
+            // Convert uint32 to float32
+            svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32);
+
+            // Dequantize: (val * delta) + min_val
+            svfloat32_t v2_dequant =
+                svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec);
+
+            // Compute difference
+            svfloat32_t diff = svsub_f32_z(pg_partial, v1, v2_dequant);
+
+            // Square difference and add to sum (sum0 is still all zeros here, so zeroing the
+            // inactive lanes loses nothing)
+            sum0 = svmla_f32_z(pg_partial, sum0, diff, diff);
+
+            // Skip past the partial chunk
+            offset += remaining;
+        }
+    }
+    // Handle remaining steps (0-3)
+    if constexpr (additional_steps > 0) {
+        L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec);
+    }
+    if constexpr (additional_steps > 1) {
+        L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec);
+    }
+    if constexpr (additional_steps > 2) {
+        L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
+    }
+
+    // Process 4 chunks at a time in the main loop
+    auto chunk_size = 4 * sve_word_count;
+    size_t number_of_chunks = dimension / chunk_size;
+    for (size_t i = 0; i < number_of_chunks; i++) {
+        L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec);
+    }
+
+    // Combine the accumulators
+    svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
+    sum = svadd_f32_z(pg, sum, sum2);
+    sum = svadd_f32_z(pg, sum, sum3);
+
+    // Horizontal sum of all elements in the vector
+    return svaddv_f32(pg, sum);
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index cf142924e..6e4086f74 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -38,31 +38,31 @@ namespace spaces {
         if (!alignment) {
             alignment = &dummy_alignment;
         }
-    
+
         dist_func_t<float> ret_dist_func = SQ8_L2Sqr;
-    
+
         [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
-    // #ifdef CPU_FEATURES_ARCH_AARCH64
-    // #ifdef OPT_SVE2
-    //     if (features.sve2) {
-    //         return Choose_FP32_L2_implementation_SVE2(dim);
-    //     }
-    // #endif
-    // #ifdef OPT_SVE
-    //     if (features.sve) {
-    //         return Choose_FP32_L2_implementation_SVE(dim);
-    //     }
-    // #endif
-    // #ifdef OPT_NEON
-    //     if (features.asimd) {
-    //         return Choose_FP32_L2_implementation_NEON(dim);
-    //     }
-    // #endif
-    // #endif
-    
+    #ifdef CPU_FEATURES_ARCH_AARCH64
+    #ifdef OPT_SVE2
+        if (features.sve2) {
+            return Choose_SQ8_L2_implementation_SVE2(dim);
+        }
+    #endif
+    #ifdef OPT_SVE
+        if (features.sve) {
+            return Choose_SQ8_L2_implementation_SVE(dim);
+        }
+    #endif
+    #ifdef OPT_NEON
+        if (features.asimd) {
+            return Choose_SQ8_L2_implementation_NEON(dim);
+        }
+    #endif
+    #endif
+
     #ifdef CPU_FEATURES_ARCH_X86_64
         // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
-    
+
         if (dim < 16) {
             return ret_dist_func;
         }
diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp
index 39098bd8c..d4952ef38 100644
--- a/src/VecSim/spaces/functions/SVE.cpp
+++ b/src/VecSim/spaces/functions/SVE.cpp
@@ -23,6 +23,7 @@
 #include "VecSim/spaces/L2/L2_SVE_UINT8.h"
 #include "VecSim/spaces/IP/IP_SVE_UINT8.h"
 #include "VecSim/spaces/IP/IP_SVE_SQ8.h"
+#include "VecSim/spaces/L2/L2_SVE_SQ8.h"
 
 namespace spaces {
 
@@ -109,6 +110,12 @@ dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func; // assigned by the chooser macro below
+    // NOTE(review): presumably picks the SQ8_L2SqrSIMD_SVE instantiation from dim and svcntw().
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_L2SqrSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h
index 86f7a7094..a24dfe326 100644
--- a/src/VecSim/spaces/functions/SVE.h
+++ b/src/VecSim/spaces/functions/SVE.h
@@ -31,5 +31,6 @@ dist_func_t<float> Choose_UINT8_IP_implementation_SVE(size_t dim);
 
 dist_func_t<float> Choose_SQ8_IP_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE(size_t dim);
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp
index 52ba020a4..c5f1626f9 100644
--- a/src/VecSim/spaces/functions/SVE2.cpp
+++ b/src/VecSim/spaces/functions/SVE2.cpp
@@ -21,6 +21,7 @@
 #include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/IP/IP_SVE_SQ8.h"   // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/L2/L2_SVE_SQ8.h"   // SVE2 implementation is identical to SVE
 
 namespace spaces {
 
@@ -107,6 +108,12 @@ dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE2(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func; // assigned by the chooser macro below
+    // NOTE(review): presumably picks the SQ8_L2SqrSIMD_SVE instantiation from dim and svcntw().
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_L2SqrSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h
index cd3570caf..57f1b8694 100644
--- a/src/VecSim/spaces/functions/SVE2.h
+++ b/src/VecSim/spaces/functions/SVE2.h
@@ -31,5 +31,6 @@ dist_func_t<float> Choose_UINT8_IP_implementation_SVE2(size_t dim);
 
 dist_func_t<float> Choose_SQ8_IP_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE2(size_t dim);
 
 } // namespace spaces
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index d1b854073..b24d17782 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2171,7 +2171,32 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
     }
     #endif
 
-    // Add other optimizations as needed (SVE2, SVE, NEON, etc.)
+    #ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset sve2 flag as well, so we'll choose the next option (default).
+        optimization.sve2 = 0;
+    }
+    #endif
+    #ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset sve flag as well, so we'll choose the next option (default).
+        optimization.sve = 0;
+    }
+    #endif
 
     // Test default implementation
     unsigned char alignment = 0;
@@ -2256,6 +2281,32 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         optimization.sse = 0;
     }
     #endif
+    #ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset sve2 flag as well, so we'll choose the next option (default).
+        optimization.sve2 = 0;
+    }
+    #endif
+    #ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset sve flag as well, so we'll choose the next option (default).
+        optimization.sve = 0;
+    }
+    #endif
 
 
     // Test default implementation
@@ -2376,84 +2427,3 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         << "No optimization with dim " << dim;
     ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
 }
-
-// Instantiate the test suite with dimensions to test
-INSTANTIATE_TEST_SUITE_P(SQ8CosineTest, SQ8SpacesOptimizationTest,
-    testing::Range(16UL, 16 * 2UL + 1));
-
-// TEST_P(SQ8SpacesOptimizationTest, SQ8_full_range_test) {
-//     auto optimization = getCpuOptimizationFeatures();
-//     constexpr size_t dim = 512;
-
-//     // Create vectors with full range of values
-//     std::vector<float> v1(dim);
-//     std::vector<float> v2(dim);
-
-//     // v1: 0..255 followed by 255..0
-//     for (size_t i = 0; i < 256; i++) {
-//         v1[i] = static_cast<float>(i) / 255.0f;
-//         v1[256 + i] = static_cast<float>(255 - i) / 255.0f;
-//     }
-
-//     // v2: 255..0 followed by 0..255
-//     for (size_t i = 0; i < 256; i++) {
-//         v2[i] = static_cast<float>(255 - i) / 255.0f;
-//         v2[256 + i] = static_cast<float>(i) / 255.0f;
-//     }
-
-//     // Create SQ8 compressed version of v2
-//     std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2.data(), dim, false);
-
-//     // Create normalized version of v1 for cosine
-//     std::vector<float> v1_norm(v1);
-//     spaces::GetNormalizeFunc<float>()(v1_norm.data(), dim);
-
-//     // Create normalized SQ8 compressed version of v2 for cosine
-//     std::vector<uint8_t> v2_compressed_norm = CreateSQ8CompressedVector(v2.data(), dim, true);
-
-//     float baseline_l2 = SQ8_L2Sqr(v1.data(), v2_compressed.data(), dim);
-//     float baseline_ip = SQ8_InnerProduct(v1.data(), v2_compressed.data(), dim);
-//     float baseline_cosine = SQ8_Cosine(v1_norm.data(), v2_compressed_norm.data(), dim);
-
-//     dist_func_t<float> arch_opt_func;
-
-//     // Test different optimizations for each metric
-//     #ifdef OPT_AVX512F
-//     if (optimization.avx512f) {
-//         // L2 test
-//         arch_opt_func = Choose_SQ8_L2_implementation_AVX512F(dim);
-//         ASSERT_NEAR(baseline_l2, arch_opt_func(v1.data(), v2_compressed.data(), dim), 0.01)
-//             << "L2 AVX512 with dim " << dim;
-
-//         // IP test
-//         arch_opt_func = Choose_SQ8_IP_implementation_AVX512F(dim);
-//         ASSERT_NEAR(baseline_ip, arch_opt_func(v1.data(), v2_compressed.data(), dim), 0.01)
-//             << "IP AVX512 with dim " << dim;
-
-//         // Cosine test
-//         arch_opt_func = Choose_SQ8_Cosine_implementation_AVX512F(dim);
-//         ASSERT_NEAR(baseline_cosine, arch_opt_func(v1_norm.data(), v2_compressed_norm.data(),
-//         dim), 0.01)
-//             << "Cosine AVX512 with dim " << dim;
-
-//         optimization.avx512f = 0;
-//     }
-//     #endif
-
-//     // Add other optimizations as needed (SVE2, SVE, NEON, etc.)
-
-
-// Instantiate the test suite with dimensions to test
-INSTANTIATE_TEST_SUITE_P(SQ8OptFuncs, SQ8SpacesOptimizationTest,
-                         testing::Range(16UL, 16 * 2UL + 1));
-
-// #endif // defined(OPT_AVX512_FP16_VL) || defined(CPU_FEATURES_ARCH_AARCH64)
-
-// class INT8SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
-
-// TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) {
-//     auto optimization = getCpuOptimizationFeatures();
-//     size_t dim = GetParam();
-//     int8_t v1[dim];
-//     int8_t v2[dim];
-//     test_utils::populate_int8_vec(v1, dim

From 2a7477c67d1cbe14c68cc2d346a409af6ee73fad Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Mon, 12 May 2025 09:46:08 +0300
Subject: [PATCH 27/52] add neon

---
 src/VecSim/spaces/IP/IP_NEON_SQ8.h            | 128 ++++++++++++++++++
 src/VecSim/spaces/IP_space.cpp                |  10 +-
 src/VecSim/spaces/L2/L2_NEON_SQ8.h            | 112 +++++++++++++++
 src/VecSim/spaces/functions/NEON.cpp          |  20 +++
 src/VecSim/spaces/functions/NEON.h            |   4 +
 .../spaces_benchmarks/bm_spaces_sq8.cpp       |  20 +++
 tests/unit/test_spaces.cpp                    |  13 ++
 7 files changed, 302 insertions(+), 5 deletions(-)
 create mode 100644 src/VecSim/spaces/IP/IP_NEON_SQ8.h
 create mode 100644 src/VecSim/spaces/L2/L2_NEON_SQ8.h

diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
new file mode 100644
index 000000000..a95f6da20
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+*/
+#include "VecSim/spaces/space_includes.h"
+#include <arm_neon.h>
+
+static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, float32x4_t &sum,
+                                   const float32x4_t &min_val_vec, const float32x4_t &delta_vec) {
+    // Load 4 float elements from pVect1 and advance past them
+    float32x4_t v1 = vld1q_f32(pVect1);
+    pVect1 += 4;
+
+    // NOTE: vld1_u8 reads 8 bytes; only the low 4 are used, so 4 extra bytes must be readable
+    uint8x8_t v2_u8 = vld1_u8(pVect2);
+    pVect2 += 4;
+
+    // Widen uint8 -> uint16, keep the low 4 lanes, then widen those to uint32
+    uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8)));
+
+    // Convert uint32 to float32
+    float32x4_t v2_f = vcvtq_f32_u32(v2_u32);
+
+    // Dequantize: min_val + (val * delta)
+    float32x4_t v2_dequant = vmlaq_f32(min_val_vec, v2_f, delta_vec);
+
+    // Accumulate the element-wise products: sum += v1 * v2_dequant
+    sum = vmlaq_f32(sum, v1, v2_dequant);
+}
+
+template <unsigned char residual> // 0..15, expected to equal dimension % 16
+float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float *pVect1 = (float *)pVect1v;
+    uint8_t *pVect2 = (uint8_t *)pVect2v;
+
+    // min_val and delta are read as floats located right after the `dimension` quantized bytes
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    // Broadcast the dequantization parameters to all 4 lanes
+    float32x4_t min_val_vec = vdupq_n_f32(min_val);
+    float32x4_t delta_vec = vdupq_n_f32(delta);
+
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+    float32x4_t sum1 = vdupq_n_f32(0.0f);
+    float32x4_t sum2 = vdupq_n_f32(0.0f);
+    float32x4_t sum3 = vdupq_n_f32(0.0f);
+
+    const size_t num_of_chunks = dimension / 16;
+
+    // Main loop: 16 elements per iteration, 4 elements into each of the 4 accumulators
+    for (size_t i = 0; i < num_of_chunks; i++) {
+        InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, sum3, min_val_vec, delta_vec);
+    }
+
+    // Handle the complete 4-element blocks of the residual (up to 3 of them)
+    if constexpr (residual >= 4) {
+        InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec);
+    }
+    if constexpr (residual >= 8) {
+        InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec);
+    }
+    if constexpr (residual >= 12) {
+        InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec);
+    }
+
+    // Handle the last residual % 4 elements (1-3) lane by lane; unused lanes stay zero
+    constexpr size_t final_residual = residual % 4;
+    if constexpr (final_residual > 0) {
+        float32x4_t v1 = vdupq_n_f32(0.0f);
+        float32x4_t v2_dequant = vdupq_n_f32(0.0f);
+
+        if constexpr (final_residual >= 1) {
+            v1 = vld1q_lane_f32(pVect1, v1, 0);
+            float dequant0 = pVect2[0] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant0, v2_dequant, 0);
+        }
+        if constexpr (final_residual >= 2) {
+            v1 = vld1q_lane_f32(pVect1 + 1, v1, 1);
+            float dequant1 = pVect2[1] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant1, v2_dequant, 1);
+        }
+        if constexpr (final_residual >= 3) {
+            v1 = vld1q_lane_f32(pVect1 + 2, v1, 2);
+            float dequant2 = pVect2[2] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant2, v2_dequant, 2);
+        }
+
+        sum3 = vmlaq_f32(sum3, v1, v2_dequant);
+    }
+
+    // Combine all four sum accumulators
+    float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3));
+
+    // Horizontal sum of the 4 lanes of the combined NEON register
+    float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined));
+    float32x2_t summed = vpadd_f32(sum_halves, sum_halves);
+    float sum = vget_lane_f32(summed, 0);
+
+    return sum;
+}
+
+template <unsigned char residual> // 0..15; returns 1 - (inner product with the dequantized vector)
+float SQ8_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductSIMD16_NEON_IMP<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual> // 0..15; forwarded unchanged to the shared inner-product kernel
+float SQ8_CosineSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // inv_norm is read as the third float after the `dimension` quantized bytes
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Inner product of pVect1 with the dequantized pVect2 (shared with the IP distance)
+    const float res = SQ8_InnerProductSIMD16_NEON_IMP<residual>(pVect1v, pVect2v, dimension);
+
+    // Scale the inner product by inv_norm once, instead of scaling every dequantized element,
+    // then convert the similarity to a distance
+    return 1.0f - res * inv_norm;
+}
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 52aa5760f..93609475d 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -53,11 +53,11 @@ namespace spaces {
             return Choose_SQ8_IP_implementation_SVE(dim);
         }
     #endif
-    // #ifdef OPT_NEON
-    //     if (features.asimd) {
-    //         return Choose_SQ8_IP_implementation_NEON(dim);
-    //     }
-    // #endif
+    #ifdef OPT_NEON
+        if (features.asimd) {
+            return Choose_SQ8_IP_implementation_NEON(dim);
+        }
+    #endif
 
     #endif
 
diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8.h b/src/VecSim/spaces/L2/L2_NEON_SQ8.h
new file mode 100644
index 000000000..617389cbb
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_NEON_SQ8.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+*/
+#include "VecSim/spaces/space_includes.h"
+#include <arm_neon.h>
+
+static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, float32x4_t &sum,
+                            const float32x4_t &min_val_vec, const float32x4_t &delta_vec) {
+    // Load 4 float elements from pVect1 and advance past them
+    float32x4_t v1 = vld1q_f32(pVect1);
+    pVect1 += 4;
+
+    // NOTE: vld1_u8 reads 8 bytes; only the low 4 are used, so 4 extra bytes must be readable
+    uint8x8_t v2_u8 = vld1_u8(pVect2);
+    pVect2 += 4;
+
+    // Widen uint8 -> uint16, keep the low 4 lanes, then widen those to uint32
+    uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8)));
+
+    // Convert uint32 to float32
+    float32x4_t v2_f = vcvtq_f32_u32(v2_u32);
+
+    // Dequantize: min_val + (val * delta)
+    float32x4_t v2_dequant = vmlaq_f32(min_val_vec, v2_f, delta_vec);
+
+    // Element-wise difference between the float vector and the dequantized vector
+    float32x4_t diff = vsubq_f32(v1, v2_dequant);
+
+    // Accumulate the squared differences: sum += diff * diff
+    sum = vmlaq_f32(sum, diff, diff);
+}
+
+template <unsigned char residual> // 0..15, expected to equal dimension % 16
+float SQ8_L2SqrSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float *pVect1 = (float *)pVect1v;
+    uint8_t *pVect2 = (uint8_t *)pVect2v;
+
+    // min_val and delta are read as floats located right after the `dimension` quantized bytes
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    // Broadcast the dequantization parameters to all 4 lanes
+    float32x4_t min_val_vec = vdupq_n_f32(min_val);
+    float32x4_t delta_vec = vdupq_n_f32(delta);
+
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+    float32x4_t sum1 = vdupq_n_f32(0.0f);
+    float32x4_t sum2 = vdupq_n_f32(0.0f);
+    float32x4_t sum3 = vdupq_n_f32(0.0f);
+
+    const size_t num_of_chunks = dimension / 16;
+
+    // Main loop: 16 elements per iteration, 4 elements into each of the 4 accumulators
+    for (size_t i = 0; i < num_of_chunks; i++) {
+        L2SqrStep(pVect1, pVect2, sum0, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, sum1, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, sum2, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, sum3, min_val_vec, delta_vec);
+    }
+
+    // Handle the complete 4-element blocks of the residual (up to 3 of them)
+    if constexpr (residual >= 4) {
+        L2SqrStep(pVect1, pVect2, sum0, min_val_vec, delta_vec);
+    }
+    if constexpr (residual >= 8) {
+        L2SqrStep(pVect1, pVect2, sum1, min_val_vec, delta_vec);
+    }
+    if constexpr (residual >= 12) {
+        L2SqrStep(pVect1, pVect2, sum2, min_val_vec, delta_vec);
+    }
+
+    // Handle the last residual % 4 elements (1-3) lane by lane; unused lanes stay zero
+    constexpr size_t final_residual = residual % 4;
+    if constexpr (final_residual > 0) {
+        float32x4_t v1 = vdupq_n_f32(0.0f);
+        float32x4_t v2_dequant = vdupq_n_f32(0.0f);
+
+        if constexpr (final_residual >= 1) {
+            v1 = vld1q_lane_f32(pVect1, v1, 0);
+            float dequant0 = pVect2[0] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant0, v2_dequant, 0);
+        }
+        if constexpr (final_residual >= 2) {
+            v1 = vld1q_lane_f32(pVect1 + 1, v1, 1);
+            float dequant1 = pVect2[1] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant1, v2_dequant, 1);
+        }
+        if constexpr (final_residual >= 3) {
+            v1 = vld1q_lane_f32(pVect1 + 2, v1, 2);
+            float dequant2 = pVect2[2] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant2, v2_dequant, 2);
+        }
+
+        float32x4_t diff = vsubq_f32(v1, v2_dequant);
+        sum3 = vmlaq_f32(sum3, diff, diff);
+    }
+
+    // Combine all four sum accumulators
+    float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3));
+
+    // Horizontal sum of the 4 lanes of the combined NEON register
+    float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined));
+    float32x2_t summed = vpadd_f32(sum_halves, sum_halves);
+    float sum = vget_lane_f32(summed, 0);
+
+    return sum;
+}
diff --git a/src/VecSim/spaces/functions/NEON.cpp b/src/VecSim/spaces/functions/NEON.cpp
index bd15c6577..debfa90c0 100644
--- a/src/VecSim/spaces/functions/NEON.cpp
+++ b/src/VecSim/spaces/functions/NEON.cpp
@@ -15,6 +15,8 @@
 #include "VecSim/spaces/IP/IP_NEON_UINT8.h"
 #include "VecSim/spaces/L2/L2_NEON_FP64.h"
 #include "VecSim/spaces/IP/IP_NEON_FP64.h"
+#include "VecSim/spaces/L2/L2_NEON_SQ8.h"
+#include "VecSim/spaces/IP/IP_NEON_SQ8.h"
 
 namespace spaces {
 
@@ -79,6 +81,24 @@ dist_func_t<double> Choose_FP64_L2_implementation_NEON(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_L2_implementation_NEON(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_NEON);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_IP_implementation_NEON(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_NEON);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_NEON(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_NEON);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h
index 7da0de6b8..4478cc149 100644
--- a/src/VecSim/spaces/functions/NEON.h
+++ b/src/VecSim/spaces/functions/NEON.h
@@ -26,4 +26,8 @@ dist_func_t<float> Choose_FP32_L2_implementation_NEON(size_t dim);
 dist_func_t<double> Choose_FP64_IP_implementation_NEON(size_t dim);
 dist_func_t<double> Choose_FP64_L2_implementation_NEON(size_t dim);
 
+dist_func_t<float> Choose_SQ8_L2_implementation_NEON(size_t dim);
+dist_func_t<float> Choose_SQ8_IP_implementation_NEON(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_NEON(size_t dim);
+
 } // namespace spaces
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
index 5d7a6bb7b..03e9d5477 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
@@ -34,6 +34,26 @@ class BM_VecSimSpaces_SQ8 : public benchmark::Fixture {
     }
 };
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+
+// NEON implementation for ARMv8-a
+#ifdef OPT_NEON
+bool neon_supported = opt.asimd; // ARMv8-a always supports NEON
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported);
+#endif
+// SVE implementation
+#ifdef OPT_SVE
+bool sve_supported = opt.sve; // Check for SVE support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported);
+#endif
+// SVE2 implementation
+#ifdef OPT_SVE2
+bool sve2_supported = opt.sve2; // Check for SVE2 support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported);
+#endif
+#endif // AARCH64
+
 #ifdef CPU_FEATURES_ARCH_X86_64
 cpu_features::X86Features opt = cpu_features::GetX86Info().features;
 
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index b24d17782..6f88bff62 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2376,6 +2376,19 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         optimization.sve = 0;
     }
     #endif
+    #ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_NEON(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "NEON with dim " << dim;
+        // We don't align SQ8 vectors with cosine distance
+        // ASSERT_EQ(alignment, 0) << "NEON with dim " << dim;
+        optimization.asimd = 0;
+    }
+    #endif
 
     // Test different optimizations based on CPU features
     #ifdef OPT_AVX512_F_BW_VL_VNNI

From b1f502c18a20aa0edd6d2cb33fe33589ae62fce4 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Mon, 12 May 2025 10:33:11 +0300
Subject: [PATCH 28/52] fix sve

---
 src/VecSim/spaces/IP/IP.cpp       |  2 --
 src/VecSim/spaces/IP/IP_SVE_SQ8.h | 43 +++++++++++++------------------
 src/VecSim/spaces/L2/L2.cpp       |  2 --
 src/VecSim/spaces/L2/L2_SVE_SQ8.h | 16 +++---------
 tests/unit/test_spaces.cpp        | 41 +++++++++++++++++++----------
 5 files changed, 49 insertions(+), 55 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp
index fd666341a..395e69dce 100644
--- a/src/VecSim/spaces/IP/IP.cpp
+++ b/src/VecSim/spaces/IP/IP.cpp
@@ -21,7 +21,6 @@ float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, s
     float res = 0;
     for (size_t i = 0; i < dimension; i++) {
         float dequantized_V2 = (pVect2v[i] * delta + min_val) * inv_norm;
-        std::cout << dequantized_V2 << ", ";
         res += pVect1v[i] * dequantized_V2;
     }
     return res;
@@ -48,7 +47,6 @@ float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
     const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
-    std::cout << "inv_norm: " << inv_norm << std::endl;
     // Compute inner product with dequantization
     const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm);
     return 1.0f - res;
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
index d6c0faa3d..bc80a8785 100644
--- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
@@ -19,12 +19,8 @@ static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, size_t &of
     // Load float elements from pVect1
     svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
     
-    // Load uint8 elements from pVect2, convert to int32, then to float
-    svbool_t pg_b8 = svptrue_b8();
-    svuint8_t v2_u8 = svld1_u8(pg_b8, pVect2 + offset);
-    
     // Convert uint8 to uint32
-    svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8));
+    svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // LD1UB: load 8-bit, zero-extend to 32-bit
     
     // Convert uint32 to float32
     svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32);
@@ -42,12 +38,12 @@ static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, size_t &of
 template <bool partial_chunk, unsigned char additional_steps>
 float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
     float *pVect1 = (float *)pVect1v;
-    uint8_t *quantized = (uint8_t *)pVect2v;
+    uint8_t *pVect2 = (uint8_t *)pVect2v;
     size_t offset = 0;
 
     // Get dequantization parameters from the end of quantized vector
-    float min = *(float *)(quantized + dimension);
-    float delta = *(float *)(quantized + dimension + sizeof(float));
+    float min = *(float *)(pVect2 + dimension);
+    float delta = *(float *)(pVect2 + dimension + sizeof(float));
     
     // Create broadcast vectors for SIMD operations
     svbool_t pg = svptrue_b32();
@@ -68,17 +64,15 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
         size_t remaining = dimension % sve_word_count;
         if (remaining > 0) {
             // Create predicate for the remaining elements
-            svbool_t pg_partial = svwhilelt_b32(0, remaining);
-            
+            svbool_t pg_partial = svwhilelt_b32(static_cast<uint32_t>(0), static_cast<uint32_t>(remaining));
+
             // Load float elements from pVect1 with predicate
             svfloat32_t v1 = svld1_f32(pg_partial, pVect1);
             
-            // Load uint8 elements from pVect2 with predicate, convert to int32, then to float
-            svbool_t pg_b8_partial = svwhilelt_b8(0, remaining);
-            svuint8_t v2_u8 = svld1_u8(pg_b8_partial, quantized);
-            
-            // Convert uint8 to uint32
-            svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8));
+
+            // load 8-bit bytes from pVect2+offset and zero-extend each into a 32-bit lane
+            svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset);  // LD1UB: load 8-bit, zero-extend to 32-bit
+
             
             // Convert uint32 to float32
             svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32);
@@ -90,8 +84,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
             sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant);
             
             // Move pointers past the partial chunk
-            pVect1 += remaining;
-            quantized += remaining;
+            offset += remaining;
         }
     }
 
@@ -100,21 +93,21 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
     const size_t number_of_chunks = (dimension - (partial_chunk ? dimension % sve_word_count : 0)) / chunk_size;
     
     for (size_t i = 0; i < number_of_chunks; i++) {
-        InnerProductStep(pVect1, quantized, offset, sum0, min_val_vec, delta_vec);
-        InnerProductStep(pVect1, quantized, offset, sum1, min_val_vec, delta_vec);
-        InnerProductStep(pVect1, quantized, offset, sum2, min_val_vec, delta_vec);
-        InnerProductStep(pVect1, quantized, offset, sum3, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec);
     }
     
     // Handle remaining steps (0-3)
     if constexpr (additional_steps > 0) {
-        InnerProductStep(pVect1, quantized, offset, sum0, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec);
     }
     if constexpr (additional_steps > 1) {
-        InnerProductStep(pVect1, quantized, offset, sum1, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec);
     }
     if constexpr (additional_steps > 2) {
-        InnerProductStep(pVect1, quantized, offset, sum2, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
     }
     
     // Combine the accumulators
diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp
index 85e78edb2..a8a1f5040 100644
--- a/src/VecSim/spaces/L2/L2.cpp
+++ b/src/VecSim/spaces/L2/L2.cpp
@@ -27,13 +27,11 @@ float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
     float res = 0;
     for (size_t i = 0; i < dimension; i++) {
         auto dequantized_V2 = (pVect2[i] * delta + min_val);
-        std::cout << dequantized_V2 << " ";
         float t = pVect1[i] - dequantized_V2;
         res += t * t;
     }
     // The last value is used to normalize the vector.
     // The normalization is done by multiplying the result by the inverse of the norm.
-    std::cout << std::endl;
     return res;
 }
 
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
index e52fe5e21..2cfdb15ad 100644
--- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
@@ -10,7 +10,7 @@
 #include <arm_sve.h>
 
 // Helper function to perform L2 squared distance calculation for a chunk of elements
-static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset,
+static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, size_t &offset,
                             svfloat32_t &sum, const svfloat32_t &min_val_vec, 
                             const svfloat32_t &delta_vec) {
     svbool_t pg = svptrue_b32();
@@ -18,12 +18,8 @@ static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_
     // Load float elements from pVect1
     svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
     
-    // Load uint8 elements from pVect2, convert to int32, then to float
-    svbool_t pg_b8 = svptrue_b8();
-    svuint8_t v2_u8 = svld1_u8(pg_b8, pVect2 + offset);
-    
     // Convert uint8 to uint32
-    svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8));
+    svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset);
     
     // Convert uint32 to float32
     svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32);
@@ -70,17 +66,13 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi
         size_t remaining = dimension % sve_word_count;
         if (remaining > 0) {
             // Create predicate for the remaining elements
-            svbool_t pg_partial = svwhilelt_b32(0, remaining);
+            svbool_t pg_partial = svwhilelt_b32(static_cast<uint32_t>(0), static_cast<uint32_t>(remaining));
             
             // Load float elements from pVect1 with predicate
             svfloat32_t v1 = svld1_f32(pg_partial, pVect1);
             
             // Load uint8 elements from pVect2 with predicate, convert to int32, then to float
-            svbool_t pg_b8_partial = svwhilelt_b8(0, remaining);
-            svuint8_t v2_u8 = svld1_u8(pg_b8_partial, pVect2);
-            
-            // Convert uint8 to uint32
-            svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8));
+            svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset);
             
             // Convert uint32 to float32
             svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32);
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 6f88bff62..a977be3b0 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -470,8 +470,6 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) {
     params[0] = min_val;
     params[1] = delta;
     params[2] = inv_norm;
-    std::cout << "min_val: " << min_val << ", delta: " << delta << ", inv_norm: " << inv_norm
-              << std::endl;
 
     float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
     ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance";
@@ -2129,7 +2127,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
 
     dist_func_t<float> arch_opt_func;
     float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim);
-    std::cout << "baseline: " << baseline << std::endl;
     // Test different optimizations based on CPU features
     #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
@@ -2197,6 +2194,20 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         optimization.sve = 0;
     }
     #endif
+    #ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "NEON with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.asimd = 0;
+    }
+    #endif
+
 
     // Test default implementation
     unsigned char alignment = 0;
@@ -2219,17 +2230,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         v2_orig[i] = float(i * 0.75 + 1.0);
     }
     spaces::GetNormalizeFunc<float>()(v1_orig.data(), dim);
-    // print v1_orig
-    std::cout << "v1_normalized: ";
-    for (size_t i = 0; i < dim; i++) {
-        std::cout << v1_orig[i] << ", ";
-    }
-    std::cout << std::endl;
-    std::cout << "v2_orig: ";
-    for (size_t i = 0; i < dim; i++) {
-        std::cout << v2_orig[i] << ", ";
-    }
-    std::cout << std::endl;
 
     // Create SQ8 compressed version of v2
     std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
@@ -2307,6 +2307,19 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         optimization.sve = 0;
     }
     #endif
+    #ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_NEON(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "NEON with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.asimd = 0;
+    }
+    #endif
 
 
     // Test default implementation

From dc154b5fa4ea49fbad29160bf1be6f5925d59b0d Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Mon, 12 May 2025 10:42:33 +0300
Subject: [PATCH 29/52] add sq8 cosine benchmarks

---
 tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
index 03e9d5477..ddee91c49 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
@@ -41,16 +41,19 @@ cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
 #ifdef OPT_NEON
 bool neon_supported = opt.asimd; // ARMv8-a always supports NEON
 INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported);
 #endif
 // SVE implementation
 #ifdef OPT_SVE
 bool sve_supported = opt.sve; // Check for SVE support
 INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported);
 #endif
 // SVE2 implementation
 #ifdef OPT_SVE2
 bool sve2_supported = opt.sve2; // Check for SVE2 support
 INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported);
 #endif
 #endif // AARCH64
 

From 25a9400ba6593b6c92aae2672aa553402c8780fc Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Mon, 12 May 2025 13:34:31 +0300
Subject: [PATCH 30/52] move sq8 quantization into test utils, remove debug code

---
 src/VecSim/spaces/IP/IP.cpp                |   7 +-
 src/VecSim/spaces/IP/IP.h                  |   8 +-
 src/VecSim/spaces/L2/L2.cpp                |   2 -
 src/VecSim/spaces/L2/L2.h                  |   1 +
 src/VecSim/spaces/L2/L2_AVX_SQ8.h          |  25 +---
 src/VecSim/spaces/L2_space.h               |   2 +-
 src/VecSim/spaces/computer/preprocessors.h | 131 ---------------------
 tests/unit/test_spaces.cpp                 |  78 ++----------
 tests/utils/tests_utils.h                  |  39 +++---
 9 files changed, 43 insertions(+), 250 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp
index 395e69dce..d93671058 100644
--- a/src/VecSim/spaces/IP/IP.cpp
+++ b/src/VecSim/spaces/IP/IP.cpp
@@ -10,7 +10,6 @@
 #include "VecSim/types/bfloat16.h"
 #include "VecSim/types/float16.h"
 #include <cstring>
-#include <iostream>
 
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
@@ -20,16 +19,16 @@ float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, s
                          float delta, float inv_norm) {
     float res = 0;
     for (size_t i = 0; i < dimension; i++) {
-        float dequantized_V2 = (pVect2v[i] * delta + min_val) * inv_norm;
+        float dequantized_V2 = (pVect2v[i] * delta + min_val);
         res += pVect1v[i] * dequantized_V2;
     }
-    return res;
+    return res * inv_norm;
 }
 
 float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const auto *pVect1 = static_cast<const float *>(pVect1v);
     const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
-    // pVect2 is a vector of int8_t, so we need to dequantize it, normalize it and then multiply it.
+    // pVect2 is a vector of uint8_t, so we need to de-quantize it, normalize it and then multiply it.
     // it is structured as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm (float)]
     // The last two values are used to dequantize the vector.
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h
index 7dfad24ce..d4796cbd6 100644
--- a/src/VecSim/spaces/IP/IP.h
+++ b/src/VecSim/spaces/IP/IP.h
@@ -5,15 +5,15 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #pragma once
 
 #include <cstdlib>
 
-/*
-    pVect1v vector of type fp32 and pVect2v vector of type int8
-*/
+// pVect1v vector of type fp32 and pVect2v vector of type uint8
 float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension);
+
+// pVect1v vector of type fp32 and pVect2v vector of type uint8
 float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);
 
 float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp
index a8a1f5040..42f219409 100644
--- a/src/VecSim/spaces/L2/L2.cpp
+++ b/src/VecSim/spaces/L2/L2.cpp
@@ -30,8 +30,6 @@ float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
         float t = pVect1[i] - dequantized_V2;
         res += t * t;
     }
-    // The last value is used to normalize the vector.
-    // The normalization is done by multiplying the result by the inverse of the norm.
     return res;
 }
 
diff --git a/src/VecSim/spaces/L2/L2.h b/src/VecSim/spaces/L2/L2.h
index 055e8c630..6f1d25927 100644
--- a/src/VecSim/spaces/L2/L2.h
+++ b/src/VecSim/spaces/L2/L2.h
@@ -10,6 +10,7 @@
 
 #include <cstdlib>
 
+// pVect1v vector of type fp32 and pVect2v vector of type uint8
 float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
 
 float FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
index 0d21d6476..53034df0e 100644
--- a/src/VecSim/spaces/L2/L2_AVX_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
@@ -8,7 +8,6 @@
 */
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
-#include <iostream>
 
 static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m256 &sum, 
                             const __m256 &min_val_vec, const __m256 &delta_vec) {
@@ -42,8 +41,6 @@ template <unsigned char residual> // 0..15
 float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) {
     float *pVect1 = (float *)pVect1v;
     uint8_t *pVect2 = (uint8_t *)pVect2v;
-    float *pVect1_debug = (float *)pVect1v;
-    uint8_t *pVect2_debug = (uint8_t *)pVect2v; 
     // Get dequantization parameters from the end of quantized vector
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
@@ -78,38 +75,22 @@ float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimen
         
         // Dequantize: (val * delta) + min_val
         __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
-        // print debug information
-        // std::cout << "v2_dequant before: ";
-        // for (size_t i = 0; i <  8; i++) {
-        //     std::cout <<  v2_dequant[i] << " ";
-        // }
-        // std::cout << std::endl;
+
         
         v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
-        // std::cout << "v2_dequant after: ";
-        // for (size_t i = 0; i <  8; i++) {
-        //     std::cout <<  v2_dequant[i] << " ";
-        // }
-        // std::cout << std::endl;
 
         __m256 diff = _mm256_sub_ps(v1, v2_dequant);
 
 
         sum = _mm256_mul_ps(diff, diff);
-        // print sum
+
     }
 
     // If the reminder is >= 8, have another step of 8 floats
     if constexpr (residual >= 8) {
         L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
     }
-    float naive_sum = 0;
-    for (size_t i = 0; i < residual; i++) {
-        auto dequantized_V2 = (pVect2_debug[i] * delta + min_val);
-        float t = pVect1_debug[i] - dequantized_V2;
-        naive_sum += t * t;
-    }
-    
+
     // We dealt with the residual part. We are left with some multiple of 16 floats.
     // In each iteration we calculate 16 floats = 512 bits.
     do {
diff --git a/src/VecSim/spaces/L2_space.h b/src/VecSim/spaces/L2_space.h
index a58fcd7e4..c26757be4 100644
--- a/src/VecSim/spaces/L2_space.h
+++ b/src/VecSim/spaces/L2_space.h
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
- */
+*/
 #pragma once
 #include "VecSim/spaces/spaces.h"
 
diff --git a/src/VecSim/spaces/computer/preprocessors.h b/src/VecSim/spaces/computer/preprocessors.h
index ae434ea69..1045299b4 100644
--- a/src/VecSim/spaces/computer/preprocessors.h
+++ b/src/VecSim/spaces/computer/preprocessors.h
@@ -111,134 +111,3 @@ class CosinePreprocessor : public PreprocessorInterface {
     spaces::normalizeVector_f<DataType> normalize_func;
     const size_t dim;
 };
-
-template <typename DataType>
-class QuantPreprocessor : public PreprocessorInterface {
-public:
-    QuantPreprocessor(std::shared_ptr<VecSimAllocator> allocator, size_t dim, size_t bits_per_dim = 8)
-        : PreprocessorInterface(allocator), dim(dim), bits_per_dim(bits_per_dim),
-          compressed_bytes_count(calculateCompressedSize(dim)) {}
-
-    void preprocess(const void *original_blob, void *&storage_blob, void *&query_blob,
-                    size_t processed_bytes_count, unsigned char alignment) const override {
-        // Case 1: Blobs are different (one might be null, or both are allocated and processed separately)
-        if (storage_blob != query_blob) {
-            // Process storage blob (compress)
-            if (storage_blob == nullptr) {
-                storage_blob = this->allocator->allocate(compressed_bytes_count);
-                quantize(original_blob, storage_blob);
-            }
-            
-            // Query blob remains uncompressed
-            if (query_blob == nullptr) {
-                query_blob = this->allocator->allocate_aligned(processed_bytes_count, alignment);
-                memcpy(query_blob, original_blob, processed_bytes_count);
-            }
-        } else { // Case 2: Blobs are the same or both null
-            if (query_blob == nullptr) {
-                // For query, we keep the original format
-                query_blob = this->allocator->allocate_aligned(processed_bytes_count, alignment);
-                memcpy(query_blob, original_blob, processed_bytes_count);
-                
-                // For storage, we compress
-                storage_blob = this->allocator->allocate(compressed_bytes_count);
-                quantize(original_blob, storage_blob);
-            } else {
-                // If both point to the same memory, we need to separate them
-                void* new_storage = this->allocator->allocate(compressed_bytes_count);
-                quantize(query_blob, new_storage);
-                storage_blob = new_storage;
-            }
-        }
-    }
-
-    void preprocessForStorage(const void *original_blob, void *&blob,
-                              size_t processed_bytes_count) const override {
-        if (blob == nullptr) {
-            blob = this->allocator->allocate(compressed_bytes_count);
-            quantize(original_blob, blob);
-        } else {
-            // If blob is already allocated, we need to compress in-place
-            void* temp = this->allocator->allocate(compressed_bytes_count);
-            quantize(blob, temp);
-            this->allocator->free_allocation(blob);
-            blob = temp;
-        }
-    }
-
-    void preprocessQuery(const void *original_blob, void *&blob, size_t processed_bytes_count,
-                         unsigned char alignment) const override {
-        // For query, we keep the original format
-        if (blob == nullptr) {
-            blob = this->allocator->allocate_aligned(processed_bytes_count, alignment);
-            memcpy(blob, original_blob, processed_bytes_count);
-        }
-    }
-
-    void preprocessQueryInPlace(void *blob, size_t processed_bytes_count,
-                                unsigned char alignment) const override {
-        // No compression for query vectors
-        assert(blob);
-    }
-
-    void preprocessStorageInPlace(void *blob, size_t processed_bytes_count) const override {
-        assert(blob);
-        // Create temporary storage for compressed data
-        void* temp = this->allocator->allocate(compressed_bytes_count);
-        quantize(blob, temp);
-        
-        // Copy compressed data back to original location
-        // Note: This assumes blob has enough space for the compressed data
-        memcpy(blob, temp, compressed_bytes_count);
-        this->allocator->free_allocation(temp);
-    }
-
-private:
-    const size_t dim;
-    const size_t bits_per_dim;
-    const size_t compressed_bytes_count;
-
-    // Calculate the size needed for the compressed vector
-    static size_t calculateCompressedSize(size_t dim) {
-        // Quantized values (int8 per dimension) + min (float32) + delta (float32)
-        return dim * sizeof(int8_t) + 2 * sizeof(float);
-    }
-
-    // Quantize the vector from original format to compressed format
-    void quantize(const void *src, void *dst) const {
-        const DataType* src_data = static_cast<const DataType*>(src);
-        
-        // Find min and max values in the vector
-        DataType min_val = src_data[0];
-        DataType max_val = src_data[0];
-        
-        for (size_t i = 0; i < dim; i++) {
-            DataType val = src_data[i];
-            min_val = val < min_val ? val : min_val;
-            max_val = val > max_val ? val : max_val;
-        }
-        
-        // Calculate delta (quantization step)
-        float delta = (max_val - min_val) / 255.0f;
-        if (delta == 0){
-            delta = 1.0f; // Avoid division by zero if all values are the same
-        }
-        
-        // Structure of compressed data:
-        // [quantized values (int8_t * dim)][min_val (float)][delta (float)]
-        int8_t* quant_values = static_cast<int8_t*>(dst); // convert to int8_t pointer
-        float* params = reinterpret_cast<float*>(quant_values + dim); // convert to float pointer starting after quantized values
-        
-        // Store min and delta values for dequantization
-        params[0] = static_cast<float>(min_val);
-        params[1] = delta;
-        
-        // Quantize each value
-        for (size_t i = 0; i < dim; i++) {
-            float normalized = (src_data[i] - min_val) / delta;
-            if (normalized < 0) normalized = 0;
-            if (normalized > 255) normalized = 255;
-            quant_values[i] = static_cast<int8_t>(normalized);
-        }
-    }
-};
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index a977be3b0..0374a774b 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -316,43 +316,16 @@ void common_ip_sq8(bool should_normalize, float expected_dist) {
         v2_orig[i] = float(i + 1.5);
     }
 
-    // Create SQ8 compressed version of v2
-    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
     size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    std::vector<uint8_t> v2_compressed(compressed_size);
     if (should_normalize) {
         spaces::GetNormalizeFunc<float>()(v1_orig, dim);
         spaces::GetNormalizeFunc<float>()(v2_orig, dim);
     }
-
-    // Find min and max for quantization
-    float min_val = v2_orig[0];
-    float max_val = v2_orig[0];
-    for (size_t i = 1; i < dim; i++) {
-        min_val = std::min(min_val, v2_orig[i]);
-        max_val = std::max(max_val, v2_orig[i]);
-    }
-
-    // Calculate delta and inverse norm
-    float delta = (max_val - min_val) / 255.0f;
-    if (delta == 0)
-        delta = 1.0f; // Avoid division by zero
-
-    std::vector<uint8_t> v2_compressed(compressed_size);
-
-    // Quantize v2
-    uint8_t *quant_values = reinterpret_cast<uint8_t *>(v2_compressed.data());
-    float *params = reinterpret_cast<float *>(quant_values + dim);
-
-    // Store parameters
-    params[0] = min_val;
-    params[1] = delta;
-
-    // Quantize each value
-    for (size_t i = 0; i < dim; i++) {
-        float normalized = (v2_orig[i] - min_val) / delta;
-        normalized = std::max(0.0f, std::min(255.0f, normalized));
-        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
-    }
+    
+    // Create SQ8 compressed version of v2
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    test_utils::quantize_float_vec_to_uint8(v2_orig, dim, v2_compressed.data());
 
     float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
 
@@ -380,47 +353,20 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) {
         v2_orig[i] = float(i + 1.5);
     }
 
-    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
     size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
-    spaces::GetNormalizeFunc<float>()(v1_orig, dim);
-    // Find min and max for quantization
-    float min_val = v2_orig[0];
-    float max_val = v2_orig[0];
-    for (size_t i = 1; i < dim; i++) {
-        min_val = std::min(min_val, v2_orig[i]);
-        max_val = std::max(max_val, v2_orig[i]);
-    }
-    // Calculate delta and inverse norm
-    float delta = (max_val - min_val) / 255.0f;
-    if (delta == 0)
-        delta = 1.0f; // Avoid division by zero
-
-    // Compress v2
     std::vector<uint8_t> v2_compressed(compressed_size);
-    uint8_t *quant_values = reinterpret_cast<uint8_t *>(v2_compressed.data());
-    float *params = reinterpret_cast<float *>(quant_values + dim);
 
-    // Quantize each value
-    for (size_t i = 0; i < dim; i++) {
-        float normalized = (v2_orig[i] - min_val) / delta;
-        normalized = std::max(0.0f, std::min(255.0f, normalized));
-        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
-    }
-    // Calculate inverse norm from decompressed values
-    float inv_norm = 0.0f;
-    for (size_t i = 0; i < dim; i++) {
-        float decompressed_value = min_val + quant_values[i] * delta;
-        inv_norm += decompressed_value * decompressed_value;
-    }
-    inv_norm = 1.0f / std::sqrt(inv_norm);
-    // Store parameters
-    params[0] = min_val;
-    params[1] = delta;
-    params[2] = inv_norm;
+    spaces::GetNormalizeFunc<float>()(v1_orig, dim);
+    spaces::GetNormalizeFunc<float>()(v2_orig, dim);
+
+    // Create SQ8 compressed version of v2
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    test_utils::quantize_float_vec_to_uint8(v2_orig, dim, v2_compressed.data());
 
     float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
     ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance";
 }
+
 TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) {
     // create a vector with extra space for the norm
     size_t dim = 5;
diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h
index bb041b0af..1485d332f 100644
--- a/tests/utils/tests_utils.h
+++ b/tests/utils/tests_utils.h
@@ -50,44 +50,43 @@ static void populate_float_vec(float *v, size_t dim, int seed = 1234) {
     }
 }
 
-static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) {
-
-    std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed
-    std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
-    std::vector<float> vec_copy(dim);
-    for (size_t i = 0; i < dim; i++) {
-        vec_copy[i] = dis(gen);
-    }
+static void quantize_float_vec_to_uint8(float *v, size_t dim, uint8_t *qv, int seed = 1234) {
 
-    // Find min and max for quantization
-    float min_val = vec_copy[0];
-    float max_val = vec_copy[0];
+    float min_val = v[0];
+    float max_val = v[0];
     for (size_t i = 1; i < dim; i++) {
-        min_val = std::min(min_val, vec_copy[i]);
-        max_val = std::max(max_val, vec_copy[i]);
+        min_val = std::min(min_val, v[i]);
+        max_val = std::max(max_val, v[i]);
     }
-
     // Calculate delta
     float delta = (max_val - min_val) / 255.0f;
     if (delta == 0)
         delta = 1.0f; // Avoid division by zero
-
     float norm = 0.0f;
     // Quantize each value
     for (size_t i = 0; i < dim; i++) {
-        float normalized = (vec_copy[i] - min_val) / delta;
+        float normalized = (v[i] - min_val) / delta;
         normalized = std::max(0.0f, std::min(255.0f, normalized));
-        v[i] = static_cast<uint8_t>(std::round(normalized));
-        norm += (v[i] * delta + min_val) * (v[i] * delta + min_val);
+        qv[i] = static_cast<uint8_t>(std::round(normalized));
+        norm += (qv[i] * delta + min_val) * (qv[i] * delta + min_val);
     }
-
     float inv_norm = 1.0f / std::sqrt(norm);
     // Store parameters
-    float *params = reinterpret_cast<float *>(v + dim);
+    float *params = reinterpret_cast<float *>(qv + dim);
     params[0] = min_val;
     params[1] = delta;
     params[2] = inv_norm;
+}
+
+static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) {
 
+    std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed
+    std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
+    std::vector<float> vec(dim);
+    for (size_t i = 0; i < dim; i++) {
+        vec[i] = dis(gen);
+    }
+    quantize_float_vec_to_uint8(vec.data(), dim, v, seed);
 }
 
 

From 9ced0be4649f6b03cf3361254b7975dfbb7a1a69 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Mon, 12 May 2025 13:39:46 +0300
Subject: [PATCH 31/52] use const pointers and static_cast in sq8 kernels

---
 src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 6 +++---
 src/VecSim/spaces/IP/IP_NEON_SQ8.h | 6 +++---
 src/VecSim/spaces/IP/IP_SSE_SQ8.h  | 6 +++---
 src/VecSim/spaces/IP/IP_SVE_SQ8.h  | 6 +++---
 src/VecSim/spaces/L2/L2_AVX_SQ8.h  | 6 +++---
 src/VecSim/spaces/L2/L2_NEON_SQ8.h | 6 +++---
 src/VecSim/spaces/L2/L2_SSE_SQ8.h  | 6 +++---
 src/VecSim/spaces/L2/L2_SVE_SQ8.h  | 7 +++----
 8 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
index df2f134f1..67e10bad1 100644
--- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
@@ -9,7 +9,7 @@
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
 
-static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 &sum256,
+static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
                                       const __m256 &min_val_vec, const __m256 &delta_vec) {
     // Load 8 float elements from pVect1
     __m256 v1 = _mm256_loadu_ps(pVect1);
@@ -35,8 +35,8 @@ static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256
 
 template <unsigned char residual> // 0..15
 float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    float *pVect1 = (float *)pVect1v;
-    uint8_t *pVect2 = (uint8_t *)pVect2v;
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
 
     // Get dequantization parameters from the end of quantized vector
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
index a95f6da20..cafe2cab4 100644
--- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
@@ -9,7 +9,7 @@
 #include "VecSim/spaces/space_includes.h"
 #include <arm_neon.h>
 
-static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, float32x4_t &sum,
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum,
                                    const float32x4_t &min_val_vec, const float32x4_t &delta_vec) {
     // Load 4 float elements from pVect1
     float32x4_t v1 = vld1q_f32(pVect1);
@@ -34,8 +34,8 @@ static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, float32x4_
 
 template <unsigned char residual> // 0..15
 float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    float *pVect1 = (float *)pVect1v;
-    uint8_t *pVect2 = (uint8_t *)pVect2v;
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
 
     // Get dequantization parameters from the end of quantized vector
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
diff --git a/src/VecSim/spaces/IP/IP_SSE_SQ8.h b/src/VecSim/spaces/IP/IP_SSE_SQ8.h
index 05b31da8d..f7bae253e 100644
--- a/src/VecSim/spaces/IP/IP_SSE_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SSE_SQ8.h
@@ -10,7 +10,7 @@
 #include <iostream>
 #include <string.h>
 
-static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, __m128 &sum_prod,
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod,
                                     const __m128 &min_val_vec, const __m128 &delta_vec) {
     // Load 4 float elements from pVect1
     __m128 v1 = _mm_loadu_ps(pVect1);
@@ -32,8 +32,8 @@ static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, __m128 &su
 
 template <unsigned char residual> // 0..15
 float SQ8_InnerProductSIMD16_SSE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    float *pVect1 = (float *)pVect1v;
-    uint8_t *quantized = (uint8_t *)pVect2v;
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *quantized = static_cast<const uint8_t *>(pVect2v);
 
     // Get dequantization parameters from the end of quantized vector
     float min = *(float *)(quantized + dimension);
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
index bc80a8785..bbbe328d7 100644
--- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
@@ -11,7 +11,7 @@
 #include <iostream>
 #include <string.h>
 
-static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, size_t &offset,
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset,
                                     svfloat32_t &sum, const svfloat32_t &min_val_vec, 
                                     const svfloat32_t &delta_vec) {
     svbool_t pg = svptrue_b32();
@@ -37,8 +37,8 @@ static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, size_t &of
 
 template <bool partial_chunk, unsigned char additional_steps>
 float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    float *pVect1 = (float *)pVect1v;
-    uint8_t *pVect2 = (uint8_t *)pVect2v;
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
     size_t offset = 0;
 
     // Get dequantization parameters from the end of quantized vector
diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
index 53034df0e..be7e77fba 100644
--- a/src/VecSim/spaces/L2/L2_AVX_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
@@ -9,7 +9,7 @@
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
 
-static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m256 &sum, 
+static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum, 
                             const __m256 &min_val_vec, const __m256 &delta_vec) {
     // Load 8 float elements from pVect1
     __m256 v1 = _mm256_loadu_ps(pVect1);
@@ -39,8 +39,8 @@ static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m256 &sum,
 
 template <unsigned char residual> // 0..15
 float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    float *pVect1 = (float *)pVect1v;
-    uint8_t *pVect2 = (uint8_t *)pVect2v;
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
     // Get dequantization parameters from the end of quantized vector
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8.h b/src/VecSim/spaces/L2/L2_NEON_SQ8.h
index 617389cbb..24f6047a7 100644
--- a/src/VecSim/spaces/L2/L2_NEON_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_NEON_SQ8.h
@@ -9,7 +9,7 @@
 #include "VecSim/spaces/space_includes.h"
 #include <arm_neon.h>
 
-static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, float32x4_t &sum,
+static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum,
                             const float32x4_t &min_val_vec, const float32x4_t &delta_vec) {
     // Load 4 float elements from pVect1
     float32x4_t v1 = vld1q_f32(pVect1);
@@ -37,8 +37,8 @@ static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, float32x4_t &sum,
 
 template <unsigned char residual> // 0..15
 float SQ8_L2SqrSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    float *pVect1 = (float *)pVect1v;
-    uint8_t *pVect2 = (uint8_t *)pVect2v;
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
 
     // Get dequantization parameters from the end of quantized vector
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
diff --git a/src/VecSim/spaces/L2/L2_SSE_SQ8.h b/src/VecSim/spaces/L2/L2_SSE_SQ8.h
index 89cd7db1a..ded00b166 100644
--- a/src/VecSim/spaces/L2/L2_SSE_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_SSE_SQ8.h
@@ -9,7 +9,7 @@
 #include "VecSim/spaces/space_includes.h"
 #include <string.h>
 
-static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m128 &sum,
+static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum,
                             const __m128 &min_val_vec, const __m128 &delta_vec) {
     // Load 4 float elements from pVect1
     __m128 v1 = _mm_loadu_ps(pVect1);
@@ -34,8 +34,8 @@ static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m128 &sum,
 
 template <unsigned char residual> // 0..15
 float SQ8_L2SqrSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    float *pVect1 = (float *)pVect1v;
-    uint8_t *pVect2 = (uint8_t *)pVect2v;
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
 
     // Get dequantization parameters from the end of quantized vector
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
index 2cfdb15ad..8f76ce56f 100644
--- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
@@ -9,8 +9,7 @@
 #include "VecSim/spaces/space_includes.h"
 #include <arm_sve.h>
 
-// Helper function to perform L2 squared distance calculation for a chunk of elements
-static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, size_t &offset,
+static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset,
                             svfloat32_t &sum, const svfloat32_t &min_val_vec, 
                             const svfloat32_t &delta_vec) {
     svbool_t pg = svptrue_b32();
@@ -39,8 +38,8 @@ static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, size_t &offset,
 
 template <bool partial_chunk, unsigned char additional_steps>
 float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    float *pVect1 = (float *)pVect1v;
-    uint8_t *pVect2 = (uint8_t *)pVect2v;
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
     size_t offset = 0;
 
     // Get dequantization parameters from the end of quantized vector

From 6028dd7ed870bcc4988198b58bded62cec2e7a06 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Mon, 12 May 2025 15:24:07 +0300
Subject: [PATCH 32/52] format

---
 src/VecSim/spaces/IP/IP_AVX2_SQ8.h            |  32 +--
 .../spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h     |  39 ++--
 src/VecSim/spaces/IP/IP_AVX_SQ8.h             |  35 ++-
 src/VecSim/spaces/IP/IP_NEON_SQ8.h            |   2 +-
 src/VecSim/spaces/IP/IP_SSE_SQ8.h             |  46 ++--
 src/VecSim/spaces/IP/IP_SVE_SQ8.h             |  63 ++---
 src/VecSim/spaces/IP_space.cpp                | 221 +++++++++---------
 src/VecSim/spaces/IP_space.h                  |   2 +-
 src/VecSim/spaces/L2/L2.cpp                   |   4 +-
 .../spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h     |  19 +-
 src/VecSim/spaces/L2/L2_AVX_SQ8.h             |  33 ++-
 src/VecSim/spaces/L2/L2_SVE_SQ8.h             |  50 ++--
 src/VecSim/spaces/L2_space.cpp                | 110 ++++-----
 src/VecSim/spaces/functions/AVX512F.cpp       |   1 -
 .../spaces/functions/AVX512F_BW_VL_VNNI.cpp   |   1 -
 .../spaces/functions/AVX512F_BW_VL_VNNI.h     |   4 +-
 tests/unit/test_bf16.cpp                      |   2 +-
 tests/unit/test_spaces.cpp                    |  94 ++++----
 tests/utils/tests_utils.h                     |   1 -
 19 files changed, 372 insertions(+), 387 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
index 67e10bad1..78151bf44 100644
--- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
@@ -5,30 +5,30 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
- */
+*/
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
 
 static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
-                                      const __m256 &min_val_vec, const __m256 &delta_vec) {
+                                       const __m256 &min_val_vec, const __m256 &delta_vec) {
     // Load 8 float elements from pVect1
     __m256 v1 = _mm256_loadu_ps(pVect1);
     pVect1 += 8;
-    
+
     // Load 8 uint8 elements from pVect2, convert to int32, then to float
-    __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2);
+    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
     pVect2 += 8;
-    
+
     // Zero-extend uint8 to int32 (AVX2 instruction)
     __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
-    
+
     // Convert int32 to float
     __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
-    
+
     // Dequantize: (val * delta) + min_val
     // Use FMA instruction available in AVX2 for better performance
     __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
-    
+
     // Compute dot product and add to sum (using FMA)
     sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256);
 }
@@ -41,7 +41,7 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size
     // Get dequantization parameters from the end of quantized vector
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    
+
     // Create broadcast vectors for SIMD operations
     __m256 min_val_vec = _mm256_set1_ps(min_val);
     __m256 delta_vec = _mm256_set1_ps(delta);
@@ -54,25 +54,25 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size
     if constexpr (residual % 8) {
         // AVX2 doesn't have native mask loading, so we use the helper function
         __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
-        
+
         // Load masked float elements
         __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
         pVect1 += residual % 8;
-        
+
         // Load masked uint8 elements
-        __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2);
+        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
         pVect2 += residual % 8;
-        
+
         // Zero-extend uint8 to int32 (AVX2 instruction)
         __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
-        
+
         // Convert int32 to float
         __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
-        
+
         // Dequantize: (val * delta) + min (using FMA)
         __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
         v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
-        
+
         // Compute dot product with masking
         sum256 = _mm256_mul_ps(v1, v2_dequant);
     }
diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
index b33b3629c..8bc0569da 100644
--- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
+++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
@@ -5,20 +5,19 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
- */
+*/
 #pragma once
 #include "VecSim/spaces/space_includes.h"
 #include <immintrin.h>
 #include <iostream>
 
-static inline void
-SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum,
-                     const __m512 &min_val_vec, const __m512 &delta_vec) {
+static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum,
+                                        const __m512 &min_val_vec, const __m512 &delta_vec) {
     // Load 16 float elements from pVec1
     __m512 v1 = _mm512_loadu_ps(pVec1);
 
     // Load 16 uint8 elements from pVec2 and convert to __m512i
-    __m128i v2_128 = _mm_loadu_si128((__m128i*)pVec2);
+    __m128i v2_128 = _mm_loadu_si128((__m128i *)pVec2);
     __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
 
     // Convert uint8 to float
@@ -37,7 +36,8 @@ SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum,
 
 // Common implementation for both inner product and cosine similarity
 template <unsigned char residual> // 0..15
-float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, float inv_norm = 1.0f) {
+float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension,
+                          float inv_norm = 1.0f) {
     const float *pVec1 = static_cast<const float *>(pVec1v);
     const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
     const float *pEnd1 = pVec1 + dimension;
@@ -62,56 +62,53 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi
         __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1);
 
         // Load masked uint8 elements
-        __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i*>(pVec2));
+        __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i *>(pVec2));
         __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
         __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
 
-
         // Dequantize
         __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
-        
+
         // Compute dot product
         __m512 product = _mm512_mul_ps(v1, dequantized);
 
-        
         // Apply mask to product and add to sum
         sum = _mm512_mask_add_ps(sum, mask, sum, product);
-        
+
         pVec1 += residual;
         pVec2 += residual;
     }
-    
+
     // Process remaining full chunks of 16 elements
     do {
         SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec);
     } while (pVec1 < pEnd1);
 
     // Return the raw inner product result
-    return _mm512_reduce_add_ps(sum);;
+    return _mm512_reduce_add_ps(sum);
+    ;
 }
 
 template <unsigned char residual> // 0..15
-float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v,
-                                              const void *pVec2v,
-                                              size_t dimension) {
+float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
+                                                size_t dimension) {
     // Calculate inner product using common implementation
     float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);
-    
+
     // The inner product similarity is 1 - ip
     return 1.0f - ip;
 }
 
 template <unsigned char residual> // 0..15
 float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
-                                         size_t dimension) {
+                                          size_t dimension) {
     // Get the inverse norm factor stored after min_val and delta
     const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
     const float inv_norm = *reinterpret_cast<const float *>(pVec2 + dimension + 2 * sizeof(float));
-    
+
     // Calculate inner product using common implementation with normalization
     float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension, inv_norm);
-    
+
     // The cosine similarity is 1 - ip
     return 1.0f - ip;
 }
-
diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
index d28a13a4f..385f7967e 100644
--- a/src/VecSim/spaces/IP/IP_AVX_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h
@@ -5,29 +5,29 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
- */
+*/
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
 
 static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
-                                      const __m256 &min_val_vec, const __m256 &delta_vec) {
+                                       const __m256 &min_val_vec, const __m256 &delta_vec) {
     // Load 8 float elements from pVect1
     __m256 v1 = _mm256_loadu_ps(pVect1);
     pVect1 += 8;
-    
+
     // Load 8 uint8 elements from pVect2, convert to int32, then to float
-    __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2);
+    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
     pVect2 += 8;
-    
+
     // Zero-extend uint8 to int32
     __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
-    
+
     // Convert int32 to float
     __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
-    
+
     // Dequantize: (val * delta) + min_val
     __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
-    
+
     // Compute dot product and add to sum
     sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2_dequant));
 }
@@ -38,7 +38,7 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen
     // pVect2 is a quantized uint8_t vector
     const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
     const float *pEnd1 = pVect1 + dimension;
-    
+
     // Get dequantization parameters from the end of quantized vector
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
@@ -54,22 +54,21 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen
         __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
         __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
         pVect1 += residual % 8;
-        
+
         // Load quantized values and dequantize
-        __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2);
+        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
         pVect2 += residual % 8;
-        
+
         // Zero-extend uint8 to int32
         __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
-        
+
         // Convert int32 to float
         __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
-        
+
         // Dequantize: (val * delta) + min_val
         __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
         v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
- 
-        
+
         // Compute dot product with masking
         sum256 = _mm256_mul_ps(v1, v2_dequant);
     }
@@ -99,10 +98,10 @@ float SQ8_CosineSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dime
     // Get dequantization parameters from the end of quantized vector
     const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
     const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
-    
+
     // Calculate inner product using common implementation with normalization
     float ip = SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
-    
+
     // For cosine, we need to account for the vector norms
     // The inv_norm parameter is stored after min_val and delta in the quantized vector
     return 1.0f - ip * inv_norm;
diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
index cafe2cab4..b2529439c 100644
--- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
@@ -10,7 +10,7 @@
 #include <arm_neon.h>
 
 static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum,
-                                   const float32x4_t &min_val_vec, const float32x4_t &delta_vec) {
+                                    const float32x4_t &min_val_vec, const float32x4_t &delta_vec) {
     // Load 4 float elements from pVect1
     float32x4_t v1 = vld1q_f32(pVect1);
     pVect1 += 4;
diff --git a/src/VecSim/spaces/IP/IP_SSE_SQ8.h b/src/VecSim/spaces/IP/IP_SSE_SQ8.h
index f7bae253e..a28f2cf12 100644
--- a/src/VecSim/spaces/IP/IP_SSE_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SSE_SQ8.h
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
- */
+*/
 #include "VecSim/spaces/space_includes.h"
 #include <iostream>
 #include <string.h>
@@ -15,17 +15,17 @@ static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2
     // Load 4 float elements from pVect1
     __m128 v1 = _mm_loadu_ps(pVect1);
     pVect1 += 4;
-    
+
     // Load 4 uint8 elements from pVect2, convert to int32, then to float
-    __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float*)pVect2)));
+    __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float *)pVect2)));
     pVect2 += 4;
-    
+
     // Convert int32 to float
     __m128 v2_f = _mm_cvtepi32_ps(v2_i);
-    
+
     // Dequantize: (val * delta) + min_val
     __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec);
-    
+
     // Compute dot product and add to sum
     sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2_dequant));
 }
@@ -38,7 +38,7 @@ float SQ8_InnerProductSIMD16_SSE_IMP(const void *pVect1v, const void *pVect2v, s
     // Get dequantization parameters from the end of quantized vector
     float min = *(float *)(quantized + dimension);
     float delta = *(float *)(quantized + dimension + sizeof(float));
-    
+
     // Create broadcast vectors for SIMD operations
     __m128 min_val_vec = _mm_set1_ps(min);
     __m128 delta_vec = _mm_set1_ps(delta);
@@ -53,43 +53,37 @@ float SQ8_InnerProductSIMD16_SSE_IMP(const void *pVect1v, const void *pVect2v, s
         if constexpr (residual % 4) {
             __m128 v1;
             __m128 v2_dequant = _mm_setzero_ps();
-            
+
             if constexpr (residual % 4 == 3) {
                 // Load 3 floats and set the last one to 0
-                v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0
+                v1 = _mm_load_ss(pVect1);                     // load 1 float, set the rest to 0
                 v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1)); // load 2 more floats into high part
-                
+
                 // Dequantize first value
                 float dequant0 = quantized[0] * delta + min;
                 v2_dequant = _mm_load_ss(&dequant0);
-                
+
                 // Dequantize next two values
-                float dequant_high[2] = {
-                    quantized[1] * delta + min,
-                    quantized[2] * delta + min
-                };
+                float dequant_high[2] = {quantized[1] * delta + min, quantized[2] * delta + min};
                 v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high);
-                
+
             } else if constexpr (residual % 4 == 2) {
                 // Load 2 floats and set the last two to 0
                 v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1);
-                
+
                 // Dequantize two values
-                float dequant_high[2] = {
-                    quantized[0] * delta + min,
-                    quantized[1] * delta + min
-                };
+                float dequant_high[2] = {quantized[0] * delta + min, quantized[1] * delta + min};
                 v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high);
-                
+
             } else if constexpr (residual % 4 == 1) {
                 // Load 1 float and set the last three to 0
                 v1 = _mm_load_ss(pVect1);
-                
+
                 // Dequantize one value
                 float dequant0 = quantized[0] * delta + min;
                 v2_dequant = _mm_load_ss(&dequant0);
             }
-            
+
             pVect1 += residual % 4;
             quantized += residual % 4;
             sum = _mm_mul_ps(v1, v2_dequant);
@@ -100,7 +94,7 @@ float SQ8_InnerProductSIMD16_SSE_IMP(const void *pVect1v, const void *pVect2v, s
     while (pVect1 < pEnd1) {
         InnerProductStep(pVect1, quantized, sum, min_val_vec, delta_vec);
     }
-    
+
     // TmpRes must be 16 bytes aligned.
     float PORTABLE_ALIGN16 TmpRes[4];
     _mm_store_ps(TmpRes, sum);
@@ -120,7 +114,7 @@ float SQ8_CosineSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dime
     const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
     // Get quantization parameters
     const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
-    
+
     // Compute inner product with dequantization using the common function
     // We need to cast away const for the inner product function, but it doesn't modify the vectors
     const float res = SQ8_InnerProductSIMD16_SSE_IMP<residual>(pVect1v, pVect2v, dimension);
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
index bbbe328d7..4fe6ad5bb 100644
--- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
@@ -5,32 +5,32 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
- */
+*/
 #include "VecSim/spaces/space_includes.h"
 #include <arm_sve.h>
 #include <iostream>
 #include <string.h>
 
 static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset,
-                                    svfloat32_t &sum, const svfloat32_t &min_val_vec, 
+                                    svfloat32_t &sum, const svfloat32_t &min_val_vec,
                                     const svfloat32_t &delta_vec) {
     svbool_t pg = svptrue_b32();
-    
+
     // Load float elements from pVect1
     svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
-    
+
     // Convert uint8 to uint32
     svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // LD1UB: loa
-    
+
     // Convert uint32 to float32
     svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32);
-    
+
     // Dequantize: (val * delta) + min_val
     svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec);
-    
+
     // Compute dot product and add to sum
     sum = svmla_f32_z(pg, sum, v1, v2_dequant);
-    
+
     // Move to the next set of elements
     offset += svcntw();
 }
@@ -44,7 +44,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
     // Get dequantization parameters from the end of quantized vector
     float min = *(float *)(pVect2 + dimension);
     float delta = *(float *)(pVect2 + dimension + sizeof(float));
-    
+
     // Create broadcast vectors for SIMD operations
     svbool_t pg = svptrue_b32();
     svfloat32_t min_val_vec = svdup_f32(min);
@@ -52,7 +52,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
 
     // Get the number of 32-bit elements per vector at runtime
     uint64_t sve_word_count = svcntw();
-    
+
     // Multiple accumulators to increase instruction-level parallelism
     svfloat32_t sum0 = svdup_f32(0.0f);
     svfloat32_t sum1 = svdup_f32(0.0f);
@@ -64,25 +64,27 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
         size_t remaining = dimension % sve_word_count;
         if (remaining > 0) {
             // Create predicate for the remaining elements
-            svbool_t pg_partial = svwhilelt_b32(static_cast<uint32_t>(0), static_cast<uint32_t>(remaining));
+            svbool_t pg_partial =
+                svwhilelt_b32(static_cast<uint32_t>(0), static_cast<uint32_t>(remaining));
 
             // Load float elements from pVect1 with predicate
             svfloat32_t v1 = svld1_f32(pg_partial, pVect1);
-            
 
             // load 8-bit bytes from pVect2+offset and zero-extend each into a 32-bit lane
-            svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset);  // LD1UB: load 8-bit, zero-extend to 32-bit :contentReference[oaicite:0]{index=0}
+            svuint32_t v2_u32 = svld1ub_u32(
+                pg_partial, pVect2 + offset); // LD1UB: load 8-bit, zero-extend to 32-bit
+                                              // (one byte per 32-bit lane)
 
-            
             // Convert uint32 to float32
             svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32);
-            
+
             // Dequantize: (val * delta) + min_val
-            svfloat32_t v2_dequant = svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec);
-            
+            svfloat32_t v2_dequant =
+                svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec);
+
             // Compute dot product and add to sum
             sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant);
-            
+
             // Move pointers past the partial chunk
             offset += remaining;
         }
@@ -90,15 +92,16 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
 
     // Process 4 chunks at a time in the main loop
     auto chunk_size = 4 * sve_word_count;
-    const size_t number_of_chunks = (dimension - (partial_chunk ? dimension % sve_word_count : 0)) / chunk_size;
-    
+    const size_t number_of_chunks =
+        (dimension - (partial_chunk ? dimension % sve_word_count : 0)) / chunk_size;
+
     for (size_t i = 0; i < number_of_chunks; i++) {
         InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec);
         InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec);
         InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
         InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec);
     }
-    
+
     // Handle remaining steps (0-3)
     if constexpr (additional_steps > 0) {
         InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec);
@@ -109,33 +112,35 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
     if constexpr (additional_steps > 2) {
         InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
     }
-    
+
     // Combine the accumulators
     svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
     sum = svadd_f32_z(pg, sum, sum2);
     sum = svadd_f32_z(pg, sum, sum3);
-    
+
     // Horizontal sum of all elements in the vector
     float result = svaddv_f32(pg, sum);
-    
+
     return result;
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
 float SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    return 1.0f - SQ8_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+    return 1.0f - SQ8_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(pVect1v, pVect2v,
+                                                                                dimension);
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
 float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
-    
+
     // Get quantization parameters
     const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
-    
+
     // Compute inner product with dequantization using the common function
-    const float res = SQ8_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
-    
+    const float res =
+        SQ8_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+
     // For cosine, we need to account for the vector norms
     // The inv_norm parameter is stored after min_val and delta in the quantized vector
     return 1.0f - res * inv_norm;
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 93609475d..9d49d072d 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -33,121 +33,122 @@ using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
 namespace spaces {
-    dist_func_t<float> IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
-        unsigned char dummy_alignment;
-        if (alignment == nullptr) {
-            alignment = &dummy_alignment;
-        }
-
-        dist_func_t<float> ret_dist_func = SQ8_InnerProduct;
-        [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
-    #ifdef CPU_FEATURES_ARCH_AARCH64
-
-    #ifdef OPT_SVE2
-        if (features.sve2) {
-            return Choose_SQ8_IP_implementation_SVE2(dim);
-        }
-    #endif
-    #ifdef OPT_SVE
-        if (features.sve) {
-            return Choose_SQ8_IP_implementation_SVE(dim);
-        }
-    #endif
-    #ifdef OPT_NEON
-        if (features.asimd) {
-            return Choose_SQ8_IP_implementation_NEON(dim);
-        }
-    #endif
-
-    #endif
-
-    #ifdef CPU_FEATURES_ARCH_X86_64
-        // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
-        if (dim < 16) {
-            return ret_dist_func;
-        }
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
-        if (features.avx512f && features.avx512bw && features.avx512vnni) {
-            if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
-                *alignment = 16 * sizeof(float); // handles 16 floats
-            return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
-        }
-    #endif
-    #ifdef OPT_AVX
-        if (features.avx) {
-            if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
-                *alignment = 8 * sizeof(float); // handles 8 floats
-            return Choose_SQ8_IP_implementation_AVX(dim);
-        }
-    #endif
-    #ifdef OPT_SSE
-        if (features.sse) {
-            if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
-                *alignment = 4 * sizeof(float); // handles 4 floats
-            return Choose_SQ8_IP_implementation_SSE(dim);
-        }
-    #endif
-    #endif // __x86_64__
+dist_func_t<float> IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (alignment == nullptr) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = SQ8_InnerProduct;
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+#ifdef CPU_FEATURES_ARCH_AARCH64
+
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_IP_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_IP_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_SQ8_IP_implementation_NEON(dim);
+    }
+#endif
+
+#endif
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+    // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+    if (dim < 16) {
         return ret_dist_func;
     }
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vnni) {
+        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 16 * sizeof(float); // handles 16 floats
+        return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#ifdef OPT_AVX
+    if (features.avx) {
+        if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 8 * sizeof(float); // handles 8 floats
+        return Choose_SQ8_IP_implementation_AVX(dim);
+    }
+#endif
+#ifdef OPT_SSE
+    if (features.sse) {
+        if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 4 * sizeof(float); // handles 4 floats
+        return Choose_SQ8_IP_implementation_SSE(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
+
+dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment,
+                                          const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (alignment == nullptr) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = SQ8_Cosine;
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+#ifdef CPU_FEATURES_ARCH_AARCH64
+
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_Cosine_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_Cosine_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_SQ8_Cosine_implementation_NEON(dim);
+    }
+#endif
+
+#endif
 
-dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
-        unsigned char dummy_alignment;
-        if (alignment == nullptr) {
-            alignment = &dummy_alignment;
-        }
-
-        dist_func_t<float> ret_dist_func = SQ8_Cosine;
-        [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
-    #ifdef CPU_FEATURES_ARCH_AARCH64
-
-    #ifdef OPT_SVE2
-        if (features.sve2) {
-            return Choose_SQ8_Cosine_implementation_SVE2(dim);
-        }
-    #endif
-    #ifdef OPT_SVE
-        if (features.sve) {
-            return Choose_SQ8_Cosine_implementation_SVE(dim);
-        }
-    #endif
-    #ifdef OPT_NEON
-        if (features.asimd) {
-            return Choose_SQ8_Cosine_implementation_NEON(dim);
-        }
-    #endif
-
-    #endif
-
-    #ifdef CPU_FEATURES_ARCH_X86_64
-        // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
-        if (dim < 16) {
-            return ret_dist_func;
-        }
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
-        if (features.avx512f && features.avx512bw && features.avx512vnni) {
-            if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
-                *alignment = 16 * sizeof(float); // handles 16 floats
-            return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
-        }
-    #endif
-    #ifdef OPT_AVX
-        if (features.avx) {
-            if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
-                *alignment = 8 * sizeof(float); // handles 8 floats
-            return Choose_SQ8_Cosine_implementation_AVX(dim);
-        }
-    #endif
-    #ifdef OPT_SSE
-        if (features.sse) {
-            if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
-                *alignment = 4 * sizeof(float); // handles 4 floats
-            return Choose_SQ8_Cosine_implementation_SSE(dim);
-        }
-    #endif
-    #endif // __x86_64__
+#ifdef CPU_FEATURES_ARCH_X86_64
+    // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+    if (dim < 16) {
         return ret_dist_func;
     }
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vnni) {
+        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 16 * sizeof(float); // handles 16 floats
+        return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#ifdef OPT_AVX
+    if (features.avx) {
+        if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 8 * sizeof(float); // handles 8 floats
+        return Choose_SQ8_Cosine_implementation_AVX(dim);
+    }
+#endif
+#ifdef OPT_SSE
+    if (features.sse) {
+        if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 4 * sizeof(float); // handles 4 floats
+        return Choose_SQ8_Cosine_implementation_SSE(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
 
 dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
     unsigned char dummy_alignment;
diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h
index e375e8e37..db2d0b2d9 100644
--- a/src/VecSim/spaces/IP_space.h
+++ b/src/VecSim/spaces/IP_space.h
@@ -30,5 +30,5 @@ dist_func_t<float> IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = n
 dist_func_t<float> Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                             const void *arch_opt = nullptr);
 dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
-                                           const void *arch_opt = nullptr);
+                                          const void *arch_opt = nullptr);
 } // namespace spaces
diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp
index 42f219409..1b40a587c 100644
--- a/src/VecSim/spaces/L2/L2.cpp
+++ b/src/VecSim/spaces/L2/L2.cpp
@@ -19,8 +19,8 @@ float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const auto *pVect1 = static_cast<const float *>(pVect1v);
     const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
     // pvect2 is a vector of int8_t, so we need to dequantize it, normalize it and then multiply it.
-    // it structred as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm (float)]
-    // The last two values are used to dequantize the vector.
+    // it structred as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm
+    // (float)] The last two values are used to dequantize the vector.
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
 
diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
index c3d06d1a3..c90aa35fd 100644
--- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
@@ -9,14 +9,13 @@
 #include "VecSim/spaces/space_includes.h"
 
 // Helper function to perform L2 squared distance calculation for a chunk of 16 elements
-static inline void
-SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum,
-              const __m512 &min_val_vec, const __m512 &delta_vec) {
+static inline void SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum,
+                                 const __m512 &min_val_vec, const __m512 &delta_vec) {
     // Load 16 float elements from pVect1
     __m512 v1 = _mm512_loadu_ps(pVect1);
 
     // Load 16 uint8 elements from pVect2 and convert to __m512i
-    __m128i v2_128 = _mm_loadu_si128((__m128i*)pVect2);
+    __m128i v2_128 = _mm_loadu_si128((__m128i *)pVect2);
     __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
 
     // Convert uint8 to float
@@ -38,7 +37,7 @@ SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum,
 
 template <unsigned char residual> // 0..15
 float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
-                                          size_t dimension) {
+                                         size_t dimension) {
     const float *pVect1 = static_cast<const float *>(pVect1v);
     const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
     const float *pEnd1 = pVect1 + dimension;
@@ -53,7 +52,7 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2
 
     // Initialize sum accumulator
     __m512 sum = _mm512_setzero_ps();
-    
+
     // Handle residual elements (0 to 15)
     if constexpr (residual > 0) {
         // Create mask for residual elements
@@ -63,7 +62,7 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2
         __m512 v1 = _mm512_maskz_loadu_ps(mask, pVect1);
 
         // Load masked uint8 elements from pVect2
-        __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i*>(pVect2));
+        __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i *>(pVect2));
         __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
         __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
 
@@ -83,12 +82,12 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2
     }
 
     // Process remaining full chunks of 16 elements
-    do  {
+    do {
         SQ8_L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
-    }while (pVect1 < pEnd1);
+    } while (pVect1 < pEnd1);
 
     // Horizontal sum
     float result = _mm512_reduce_add_ps(sum);
-    
+
     return result;
 }
diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
index be7e77fba..f6fceca0d 100644
--- a/src/VecSim/spaces/L2/L2_AVX_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h
@@ -9,29 +9,29 @@
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
 
-static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum, 
-                            const __m256 &min_val_vec, const __m256 &delta_vec) {
+static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum,
+                             const __m256 &min_val_vec, const __m256 &delta_vec) {
     // Load 8 float elements from pVect1
     __m256 v1 = _mm256_loadu_ps(pVect1);
-    
+
     // Load 8 uint8 elements from pVect2
-    __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2);
-    
+    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+
     // Zero-extend uint8 to int32
     __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
-    
+
     // Convert int32 to float
     __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
-    
+
     // Dequantize: (val * delta) + min_val
     __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
-    
+
     // Compute difference
     __m256 diff = _mm256_sub_ps(v1, v2_dequant);
-    
+
     // Square difference and add to sum
     sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
-    
+
     // Advance pointers
     pVect1 += 8;
     pVect2 += 8;
@@ -57,33 +57,30 @@ float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimen
         __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
         __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
         pVect1 += residual % 8;
-        
+
         uint8_t temp_buf[8] = {0};
         // Manually copy elements
         for (size_t i = 0; i < residual % 8; i++) {
             temp_buf[i] = pVect2[i];
         }
         // Load from buffer
-        __m128i v2_128 = _mm_loadl_epi64((__m128i*)temp_buf);
+        __m128i v2_128 = _mm_loadl_epi64((__m128i *)temp_buf);
         pVect2 += residual % 8;
-        
+
         // Zero-extend uint8 to int32
         __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
-        
+
         // Convert int32 to float
         __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
-        
+
         // Dequantize: (val * delta) + min_val
         __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
 
-        
         v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
 
         __m256 diff = _mm256_sub_ps(v1, v2_dequant);
 
-
         sum = _mm256_mul_ps(diff, diff);
-
     }
 
     // If the reminder is >= 8, have another step of 8 floats
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
index 8f76ce56f..7e3db05d5 100644
--- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
@@ -10,28 +10,28 @@
 #include <arm_sve.h>
 
 static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset,
-                            svfloat32_t &sum, const svfloat32_t &min_val_vec, 
-                            const svfloat32_t &delta_vec) {
+                             svfloat32_t &sum, const svfloat32_t &min_val_vec,
+                             const svfloat32_t &delta_vec) {
     svbool_t pg = svptrue_b32();
-    
+
     // Load float elements from pVect1
     svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
-    
+
     // Convert uint8 to uint32
     svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset);
-    
+
     // Convert uint32 to float32
     svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32);
-    
+
     // Dequantize: (val * delta) + min_val
     svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec);
-    
+
     // Compute difference
     svfloat32_t diff = svsub_f32_z(pg, v1, v2_dequant);
-    
+
     // Square difference and add to sum
     sum = svmla_f32_z(pg, sum, diff, diff);
-    
+
     // Move to the next set of elements
     offset += svcntw();
 }
@@ -45,7 +45,7 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi
     // Get dequantization parameters from the end of quantized vector
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    
+
     // Create broadcast vectors for SIMD operations
     svbool_t pg = svptrue_b32();
     svfloat32_t min_val_vec = svdup_f32(min_val);
@@ -53,7 +53,7 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi
 
     // Get the number of 32-bit elements per vector at runtime
     uint64_t sve_word_count = svcntw();
-    
+
     // Multiple accumulators to increase instruction-level parallelism
     svfloat32_t sum0 = svdup_f32(0.0f);
     svfloat32_t sum1 = svdup_f32(0.0f);
@@ -65,26 +65,28 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi
         size_t remaining = dimension % sve_word_count;
         if (remaining > 0) {
             // Create predicate for the remaining elements
-            svbool_t pg_partial = svwhilelt_b32(static_cast<uint32_t>(0), static_cast<uint32_t>(remaining));
-            
+            svbool_t pg_partial =
+                svwhilelt_b32(static_cast<uint32_t>(0), static_cast<uint32_t>(remaining));
+
             // Load float elements from pVect1 with predicate
             svfloat32_t v1 = svld1_f32(pg_partial, pVect1);
-            
+
             // Load uint8 elements from pVect2 with predicate, convert to int32, then to float
             svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset);
-            
+
             // Convert uint32 to float32
             svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32);
-            
+
             // Dequantize: (val * delta) + min_val
-            svfloat32_t v2_dequant = svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec);
-            
+            svfloat32_t v2_dequant =
+                svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec);
+
             // Compute difference
             svfloat32_t diff = svsub_f32_z(pg_partial, v1, v2_dequant);
-            
+
             // Square difference and add to sum
             sum0 = svmla_f32_z(pg_partial, sum0, diff, diff);
-            
+
             // Move pointers past the partial chunk
             offset += remaining;
         }
@@ -99,27 +101,25 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi
     if constexpr (additional_steps > 2) {
         L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
     }
-    
 
     // Process 4 chunks at a time in the main loop
     auto chunk_size = 4 * sve_word_count;
     size_t number_of_chunks = dimension / chunk_size;
-    
+
     for (size_t i = 0; i < number_of_chunks; i++) {
         L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec);
         L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec);
         L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
         L2SqrStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec);
     }
-    
 
     // Combine the accumulators
     svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
     sum = svadd_f32_z(pg, sum, sum2);
     sum = svadd_f32_z(pg, sum, sum3);
-    
+
     // Horizontal sum of all elements in the vector
     float result = svaddv_f32(pg, sum);
-    
+
     return result;
 }
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 6e4086f74..363330f29 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -33,63 +33,63 @@ using float16 = vecsim_types::float16;
 
 namespace spaces {
 
-    dist_func_t<float> L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
-        unsigned char dummy_alignment;
-        if (!alignment) {
-            alignment = &dummy_alignment;
-        }
-
-        dist_func_t<float> ret_dist_func = SQ8_L2Sqr;
-
-        [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
-    #ifdef CPU_FEATURES_ARCH_AARCH64
-    #ifdef OPT_SVE2
-        if (features.sve2) {
-            return Choose_SQ8_L2_implementation_SVE2(dim);
-        }
-    #endif
-    #ifdef OPT_SVE
-        if (features.sve) {
-            return Choose_SQ8_L2_implementation_SVE(dim);
-        }
-    #endif
-    #ifdef OPT_NEON
-        if (features.asimd) {
-            return Choose_SQ8_L2_implementation_NEON(dim);
-        }
-    #endif
-    #endif
-
-    #ifdef CPU_FEATURES_ARCH_X86_64
-        // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
-
-        if (dim < 16) {
-            return ret_dist_func;
-        }
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
-        if (features.avx512f && features.avx512bw && features.avx512vnni) {
-            if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
-                *alignment = 16 * sizeof(float); // handles 16 floats
-            return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim);
-        }
-    #endif
-    #ifdef OPT_AVX
-        if (features.avx) {
-            if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
-                *alignment = 8 * sizeof(float); // handles 8 floats
-            return Choose_SQ8_L2_implementation_AVX(dim);
-        }
-    #endif
-    #ifdef OPT_SSE
-        if (features.sse) {
-            if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
-                *alignment = 4 * sizeof(float); // handles 4 floats
-            return Choose_SQ8_L2_implementation_SSE(dim);
-        }
-    #endif
-    #endif // __x86_64__
+dist_func_t<float> L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (!alignment) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = SQ8_L2Sqr;
+
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_L2_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_L2_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_SQ8_L2_implementation_NEON(dim);
+    }
+#endif
+#endif
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+    // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+
+    if (dim < 16) {
         return ret_dist_func;
     }
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vnni) {
+        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 16 * sizeof(float); // handles 16 floats
+        return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#ifdef OPT_AVX
+    if (features.avx) {
+        if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 8 * sizeof(float); // handles 8 floats
+        return Choose_SQ8_L2_implementation_AVX(dim);
+    }
+#endif
+#ifdef OPT_SSE
+    if (features.sse) {
+        if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 4 * sizeof(float); // handles 4 floats
+        return Choose_SQ8_L2_implementation_SSE(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
 
 dist_func_t<float> L2_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
     unsigned char dummy_alignment;
diff --git a/src/VecSim/spaces/functions/AVX512F.cpp b/src/VecSim/spaces/functions/AVX512F.cpp
index c9124f3b4..bcddbea91 100644
--- a/src/VecSim/spaces/functions/AVX512F.cpp
+++ b/src/VecSim/spaces/functions/AVX512F.cpp
@@ -16,7 +16,6 @@
 #include "VecSim/spaces/IP/IP_AVX512F_FP32.h"
 #include "VecSim/spaces/IP/IP_AVX512F_FP64.h"
 
-
 namespace spaces {
 
 #include "implementation_chooser.h"
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
index 889725204..b383ab4e2 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
@@ -21,7 +21,6 @@ namespace spaces {
 
 #include "implementation_chooser.h"
 
-
 dist_func_t<float> Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
     dist_func_t<float> ret_dist_func;
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI);
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
index 77eff5d57..745a339fb 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
@@ -5,15 +5,13 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #pragma once
 
 #include "VecSim/spaces/spaces.h"
 
 namespace spaces {
 
-
-
 dist_func_t<float> Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 dist_func_t<float> Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 dist_func_t<float> Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
diff --git a/tests/unit/test_bf16.cpp b/tests/unit/test_bf16.cpp
index ebef947f0..458aeb80d 100644
--- a/tests/unit/test_bf16.cpp
+++ b/tests/unit/test_bf16.cpp
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 
 #include "gtest/gtest.h"
 #include "VecSim/vec_sim.h"
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 0374a774b..fcd75b70b 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -322,7 +322,7 @@ void common_ip_sq8(bool should_normalize, float expected_dist) {
         spaces::GetNormalizeFunc<float>()(v1_orig, dim);
         spaces::GetNormalizeFunc<float>()(v2_orig, dim);
     }
-    
+
     // Create SQ8 compressed version of v2
     // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
     test_utils::quantize_float_vec_to_uint8(v2_orig, dim, v2_compressed.data());
@@ -2062,7 +2062,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         v1_orig[i] = float(i + 1.5);
         v2_orig[i] = float(i * 0.75 + 1.0);
     }
-    
+
     // Create SQ8 compressed version of v2
     std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
 
@@ -2073,8 +2073,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
 
     dist_func_t<float> arch_opt_func;
     float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim);
-    // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2086,21 +2086,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.avx512f = 0;
     }
-    #endif
-    #ifdef OPT_AVX
+#endif
+#ifdef OPT_AVX
     if (optimization.avx) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX(dim))
             << "Unexpected distance function chosen for dim " << dim;
-            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX with dim " << dim;
         // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
         // Unset avx flag as well, so we'll choose the next optimization (SSE).
         optimization.avx = 0;
     }
-    #endif
-    #ifdef OPT_SSE
+#endif
+#ifdef OPT_SSE
     if (optimization.sse) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2112,9 +2112,9 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset sse flag as well, so we'll choose the next optimization (default).
         optimization.sse = 0;
     }
-    #endif
+#endif
 
-    #ifdef OPT_SVE2
+#ifdef OPT_SVE2
     if (optimization.sve2) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2126,8 +2126,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset sve2 flag as well, so we'll choose the next option (default).
         optimization.sve2 = 0;
     }
-    #endif
-    #ifdef OPT_SVE
+#endif
+#ifdef OPT_SVE
     if (optimization.sve) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2139,21 +2139,20 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset sve flag as well, so we'll choose the next option (default).
         optimization.sve = 0;
     }
-    #endif
-    #ifdef OPT_NEON
+#endif
+#ifdef OPT_NEON
     if (optimization.asimd) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim))
             << "Unexpected distance function chosen for dim " << dim;
-            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "NEON with dim " << dim;
         ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.asimd = 0;
     }
-    #endif
-
+#endif
 
     // Test default implementation
     unsigned char alignment = 0;
@@ -2190,8 +2189,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
     dist_func_t<float> arch_opt_func;
     float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim);
 
-    // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2202,8 +2201,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
         optimization.avx512f = 0;
     }
-    #endif
-    #ifdef OPT_AVX
+#endif
+#ifdef OPT_AVX
     if (optimization.avx) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2214,8 +2213,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
         optimization.avx = 0;
     }
-    #endif
-    #ifdef OPT_SSE
+#endif
+#ifdef OPT_SSE
     if (optimization.sse) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2226,21 +2225,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim;
         optimization.sse = 0;
     }
-    #endif
-    #ifdef OPT_SVE2
+#endif
+#ifdef OPT_SVE2
     if (optimization.sve2) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim))
             << "Unexpected distance function chosen for dim " << dim;
-            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "SVE2 with dim " << dim;
         ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
         // Unset sve2 flag as well, so we'll choose the next option (default).
         optimization.sve2 = 0;
     }
-    #endif
-    #ifdef OPT_SVE
+#endif
+#ifdef OPT_SVE
     if (optimization.sve) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2252,8 +2251,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // Unset sve flag as well, so we'll choose the next option (default).
         optimization.sve = 0;
     }
-    #endif
-    #ifdef OPT_NEON
+#endif
+#ifdef OPT_NEON
     if (optimization.asimd) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2265,8 +2264,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.asimd = 0;
     }
-    #endif
-
+#endif
 
     // Test default implementation
     unsigned char alignment = 0;
@@ -2309,7 +2307,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
     dist_func_t<float> arch_opt_func;
     float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim);
 
-    #ifdef OPT_SVE2
+#ifdef OPT_SVE2
     if (optimization.sve2) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2321,8 +2319,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim;
         optimization.sve2 = 0;
     }
-    #endif
-    #ifdef OPT_SVE
+#endif
+#ifdef OPT_SVE
     if (optimization.sve) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2334,8 +2332,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "SVE with dim " << dim;
         optimization.sve = 0;
     }
-    #endif
-    #ifdef OPT_NEON
+#endif
+#ifdef OPT_NEON
     if (optimization.asimd) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2347,10 +2345,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "NEON with dim " << dim;
         optimization.asimd = 0;
     }
-    #endif
+#endif
 
-    // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2362,8 +2360,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim;
         optimization.avx512f = 0;
     }
-    #endif
-    #ifdef OPT_AVX
+#endif
+#ifdef OPT_AVX
     if (optimization.avx) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2375,9 +2373,9 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim;
         optimization.avx = 0;
     }
-    #endif
+#endif
 
-    #ifdef OPT_SSE
+#ifdef OPT_SSE
     if (optimization.sse) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2389,13 +2387,13 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "SSE with dim " << dim;
         optimization.sse = 0;
     }
-    #endif
+#endif
 
     // Test default implementation
     unsigned char alignment = 0;
     arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
-    ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " <<
-    dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+    ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
         << "No optimization with dim " << dim;
     ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
 }
diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h
index 1485d332f..7aa18dbbe 100644
--- a/tests/utils/tests_utils.h
+++ b/tests/utils/tests_utils.h
@@ -89,7 +89,6 @@ static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) {
     quantize_float_vec_to_uint8(vec.data(), dim, v, seed);
 }
 
-
 template <typename datatype>
 float integral_compute_norm(const datatype *vec, size_t dim) {
     return spaces::IntegralType_ComputeNorm<datatype>(vec, dim);

From 3c2ee113dd5911ca5fe244dda65f22e9abb9dac3 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Mon, 12 May 2025 15:46:57 +0300
Subject: [PATCH 33/52] Fix SQ8_L2Sqr comments: values are uint8_t, not int8_t

---
 src/VecSim/spaces/L2/L2.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp
index 1b40a587c..03ade3885 100644
--- a/src/VecSim/spaces/L2/L2.cpp
+++ b/src/VecSim/spaces/L2/L2.cpp
@@ -18,8 +18,8 @@ using float16 = vecsim_types::float16;
 float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const auto *pVect1 = static_cast<const float *>(pVect1v);
     const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
-    // pvect2 is a vector of int8_t, so we need to dequantize it, normalize it and then multiply it.
-    // it structred as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm
+    // pvect2 is a vector of uint8_t, so we need to dequantize it, normalize it and then multiply it.
+    // it structred as [quantized values (uint8_t * dim)][min_val (float)][delta (float)][inv_norm
     // (float)] The last two values are used to dequantize the vector.
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));

From ad3985e994e80c41c5ae8eb8d049ac664adaf322 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Mon, 12 May 2025 18:25:46 +0300
Subject: [PATCH 34/52] Apply code formatting to SQ8 IP/L2 sources

---
 src/VecSim/spaces/IP/IP.cpp                   | 16 +++---
 src/VecSim/spaces/IP/IP_AVX2_SQ8.h            |  2 +-
 .../spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h     |  2 +-
 src/VecSim/spaces/IP/IP_NEON_SQ8.h            |  2 +-
 src/VecSim/spaces/IP/IP_SSE4_SQ8.h            |  2 +-
 src/VecSim/spaces/IP/IP_SVE_SQ8.h             |  2 +-
 src/VecSim/spaces/L2/L2.cpp                   |  6 +--
 src/VecSim/spaces/L2/L2_AVX2_SQ8.h            |  2 +-
 .../spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h     |  2 +-
 src/VecSim/spaces/L2/L2_NEON_SQ8.h            |  4 +-
 src/VecSim/spaces/L2/L2_SSE4_SQ8.h            | 52 ++++++++-----------
 src/VecSim/spaces/L2/L2_SVE_SQ8.h             |  2 +-
 src/VecSim/spaces/functions/AVX.cpp           |  1 -
 src/VecSim/spaces/functions/SSE4.h            |  1 -
 14 files changed, 44 insertions(+), 52 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp
index 6db1a6d77..5e2c4b4dc 100644
--- a/src/VecSim/spaces/IP/IP.cpp
+++ b/src/VecSim/spaces/IP/IP.cpp
@@ -14,9 +14,8 @@
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
-
-float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension, float min_val,
-                         float delta, float inv_norm) {
+float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension,
+                                 float min_val, float delta, float inv_norm) {
     float res = 0;
     for (size_t i = 0; i < dimension; i++) {
         float dequantized_V2 = (pVect2v[i] * delta + min_val);
@@ -28,9 +27,9 @@ float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, s
 float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const auto *pVect1 = static_cast<const float *>(pVect1v);
     const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
-    // pVect2 is a vector of uint8_t, so we need to de-quantize it, normalize it and then multiply it.
-    // it is structured as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm (float)]
-    // The last two values are used to dequantize the vector.
+    // pVect2 is a vector of uint8_t, so we need to de-quantize it, normalize it and then multiply
+    // it. it is structured as [quantized values (int8_t * dim)][min_val (float)][delta
+    // (float)][inv_norm (float)] The last two values are used to dequantize the vector.
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
     // Compute inner product with dequantization
@@ -41,13 +40,14 @@ float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensio
 float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const auto *pVect1 = static_cast<const float *>(pVect1v);
     const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
-    
+
     // Get quantization parameters
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
     const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
     // Compute inner product with dequantization
-    const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm);
+    const float res =
+        FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm);
     return 1.0f - res;
 }
 
diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
index 8a6d745e3..6ea609f2d 100644
--- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
 
diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
index 8bc0569da..f2f4efd52 100644
--- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
+++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #pragma once
 #include "VecSim/spaces/space_includes.h"
 #include <immintrin.h>
diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
index b2529439c..3e632dcdb 100644
--- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #include "VecSim/spaces/space_includes.h"
 #include <arm_neon.h>
 
diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
index a0b0b02ff..0a6f3ee8c 100644
--- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #include "VecSim/spaces/space_includes.h"
 #include <iostream>
 #include <string.h>
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
index 4fe6ad5bb..4beaf81ca 100644
--- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #include "VecSim/spaces/space_includes.h"
 #include <arm_sve.h>
 #include <iostream>
diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp
index 6b1774316..a68ea5114 100644
--- a/src/VecSim/spaces/L2/L2.cpp
+++ b/src/VecSim/spaces/L2/L2.cpp
@@ -18,9 +18,9 @@ using float16 = vecsim_types::float16;
 float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const auto *pVect1 = static_cast<const float *>(pVect1v);
     const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
-    // pvect2 is a vector of uint8_t, so we need to dequantize it, normalize it and then multiply it.
-    // it structred as [quantized values (uint8_t * dim)][min_val (float)][delta (float)][inv_norm
-    // (float)] The last two values are used to dequantize the vector.
+    // pvect2 is a vector of uint8_t, so we need to dequantize it, normalize it and then multiply
+    // it. it structred as [quantized values (uint8_t * dim)][min_val (float)][delta
+    // (float)][inv_norm (float)] The last two values are used to dequantize the vector.
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
 
diff --git a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h
index 56346ddb9..2d2702763 100644
--- a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
 
diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
index c90aa35fd..d2775f5be 100644
--- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #include "VecSim/spaces/space_includes.h"
 
 // Helper function to perform L2 squared distance calculation for a chunk of 16 elements
diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8.h b/src/VecSim/spaces/L2/L2_NEON_SQ8.h
index 24f6047a7..e751d1c00 100644
--- a/src/VecSim/spaces/L2/L2_NEON_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_NEON_SQ8.h
@@ -5,12 +5,12 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #include "VecSim/spaces/space_includes.h"
 #include <arm_neon.h>
 
 static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum,
-                            const float32x4_t &min_val_vec, const float32x4_t &delta_vec) {
+                             const float32x4_t &min_val_vec, const float32x4_t &delta_vec) {
     // Load 4 float elements from pVect1
     float32x4_t v1 = vld1q_f32(pVect1);
     pVect1 += 4;
diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
index 12e7251be..3ee673d3d 100644
--- a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
@@ -5,29 +5,29 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #include "VecSim/spaces/space_includes.h"
 #include <string.h>
 
 static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum,
-                            const __m128 &min_val_vec, const __m128 &delta_vec) {
+                             const __m128 &min_val_vec, const __m128 &delta_vec) {
     // Load 4 float elements from pVect1
     __m128 v1 = _mm_loadu_ps(pVect1);
     pVect1 += 4;
-    
+
     // Load 4 uint8 elements from pVect2, convert to int32, then to float
-    __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float*)pVect2)));
+    __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float *)pVect2)));
     pVect2 += 4;
-    
+
     // Convert int32 to float
     __m128 v2_f = _mm_cvtepi32_ps(v2_i);
-    
+
     // Dequantize: (val * delta) + min_val
     __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec);
-    
+
     // Compute difference
     __m128 diff = _mm_sub_ps(v1, v2_dequant);
-    
+
     // Square difference and add to sum
     sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
 }
@@ -40,7 +40,7 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime
     // Get dequantization parameters from the end of quantized vector
     const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
     const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    
+
     // Create broadcast vectors for SIMD operations
     __m128 min_val_vec = _mm_set1_ps(min_val);
     __m128 delta_vec = _mm_set1_ps(delta);
@@ -55,49 +55,43 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime
         if constexpr (residual % 4) {
             __m128 v1;
             __m128 v2_dequant = _mm_setzero_ps();
-            
+
             if constexpr (residual % 4 == 3) {
                 // Load 3 floats and set the last one to 0
-                v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0
+                v1 = _mm_load_ss(pVect1);                     // load 1 float, set the rest to 0
                 v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1)); // load 2 more floats into high part
-                
+
                 // Dequantize first value
                 float dequant0 = pVect2[0] * delta + min_val;
                 v2_dequant = _mm_load_ss(&dequant0);
-                
+
                 // Dequantize next two values
-                float dequant_high[2] = {
-                    pVect2[1] * delta + min_val,
-                    pVect2[2] * delta + min_val
-                };
+                float dequant_high[2] = {pVect2[1] * delta + min_val, pVect2[2] * delta + min_val};
                 v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high);
-                
+
             } else if constexpr (residual % 4 == 2) {
                 // Load 2 floats and set the last two to 0
                 v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1);
-                
+
                 // Dequantize two values
-                float dequant_high[2] = {
-                    pVect2[0] * delta + min_val,
-                    pVect2[1] * delta + min_val
-                };
+                float dequant_high[2] = {pVect2[0] * delta + min_val, pVect2[1] * delta + min_val};
                 v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high);
-                
+
             } else if constexpr (residual % 4 == 1) {
                 // Load 1 float and set the last three to 0
                 v1 = _mm_load_ss(pVect1);
-                
+
                 // Dequantize one value
                 float dequant0 = pVect2[0] * delta + min_val;
                 v2_dequant = _mm_load_ss(&dequant0);
             }
-            
+
             pVect1 += residual % 4;
             pVect2 += residual % 4;
-            
+
             // Compute difference
             __m128 diff = _mm_sub_ps(v1, v2_dequant);
-            
+
             // Square difference and initialize sum
             sum = _mm_mul_ps(diff, diff);
         }
@@ -118,7 +112,7 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime
         L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
         L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
     }
-    
+
     // TmpRes must be 16 bytes aligned
     float PORTABLE_ALIGN16 TmpRes[4];
     _mm_store_ps(TmpRes, sum);
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
index 7e3db05d5..8bce46365 100644
--- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
@@ -5,7 +5,7 @@
  * Licensed under your choice of the Redis Source Available License 2.0
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
-*/
+ */
 #include "VecSim/spaces/space_includes.h"
 #include <arm_sve.h>
 
diff --git a/src/VecSim/spaces/functions/AVX.cpp b/src/VecSim/spaces/functions/AVX.cpp
index 253afce14..4b707a5b5 100644
--- a/src/VecSim/spaces/functions/AVX.cpp
+++ b/src/VecSim/spaces/functions/AVX.cpp
@@ -18,7 +18,6 @@ namespace spaces {
 
 #include "implementation_chooser.h"
 
-
 dist_func_t<float> Choose_FP32_IP_implementation_AVX(size_t dim) {
     dist_func_t<float> ret_dist_func;
     CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX);
diff --git a/src/VecSim/spaces/functions/SSE4.h b/src/VecSim/spaces/functions/SSE4.h
index 654bf7f94..27bbae0e0 100644
--- a/src/VecSim/spaces/functions/SSE4.h
+++ b/src/VecSim/spaces/functions/SSE4.h
@@ -16,5 +16,4 @@ dist_func_t<float> Choose_SQ8_IP_implementation_SSE4(size_t dim);
 dist_func_t<float> Choose_SQ8_Cosine_implementation_SSE4(size_t dim);
 dist_func_t<float> Choose_SQ8_L2_implementation_SSE4(size_t dim);
 
-
 } // namespace spaces

From 76d2fdd424a5d4271d353fe250f98a23fe3540a7 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 18 May 2025 12:13:34 +0300
Subject: [PATCH 35/52] added fma avx2

---
 cmake/x86_64InstructionFlags.cmake            |   4 +
 src/VecSim/spaces/CMakeLists.txt              |   6 +
 src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h        | 112 +++++++++
 src/VecSim/spaces/IP/IP_AVX2_SQ8.h            |   1 -
 .../spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h     |   7 +-
 src/VecSim/spaces/IP_space.cpp                |  15 ++
 src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h        | 106 +++++++++
 src/VecSim/spaces/L2_space.cpp                |   8 +
 src/VecSim/spaces/functions/AVX2_FMA.cpp      |  36 +++
 src/VecSim/spaces/functions/AVX2_FMA.h        |  20 ++
 tests/benchmark/spaces_benchmarks/bm_spaces.h |   1 +
 .../spaces_benchmarks/bm_spaces_sq8.cpp       |   6 +
 tests/unit/test_spaces.cpp                    | 217 +++++++++++++-----
 13 files changed, 473 insertions(+), 66 deletions(-)
 create mode 100644 src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
 create mode 100644 src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
 create mode 100644 src/VecSim/spaces/functions/AVX2_FMA.cpp
 create mode 100644 src/VecSim/spaces/functions/AVX2_FMA.h

diff --git a/cmake/x86_64InstructionFlags.cmake b/cmake/x86_64InstructionFlags.cmake
index 29281be37..dadd550a8 100644
--- a/cmake/x86_64InstructionFlags.cmake
+++ b/cmake/x86_64InstructionFlags.cmake
@@ -61,6 +61,10 @@ if(CXX_AVX2)
 	add_compile_definitions(OPT_AVX2)
 endif()
 
+if(CXX_AVX2 AND CXX_FMA)
+	add_compile_definitions(OPT_AVX2_FMA)
+endif()
+
 if(CXX_AVX)
 	add_compile_definitions(OPT_AVX)
 endif()
diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
index cd6179999..d88750e91 100644
--- a/src/VecSim/spaces/CMakeLists.txt
+++ b/src/VecSim/spaces/CMakeLists.txt
@@ -56,6 +56,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 		list(APPEND OPTIMIZATIONS functions/AVX2.cpp)
 	endif()
 
+	if(CXX_AVX2 AND CXX_FMA)
+		message("Building with AVX2 and FMA")
+		set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
+		list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp)
+	endif()
+
 	if(CXX_F16C AND CXX_FMA AND CXX_AVX)
 		message("Building with CXX_F16C")
 		set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx")
diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
new file mode 100644
index 000000000..822277c93
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+
+// One 8-element step of the SQ8 inner product.
+// Reads 8 floats from pVect1 and 8 uint8 codes from pVect2, dequantizes each code as
+// code * delta + min_val (delta_vec / min_val_vec hold those values in every lane),
+// and adds the lane-wise products to sum256.
+// Both pointers are passed by reference and are advanced by the 8 elements consumed.
+static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
+                                      const __m256 &min_val_vec, const __m256 &delta_vec) {
+    // Load 8 float elements from pVect1
+    __m256 v1 = _mm256_loadu_ps(pVect1);
+    pVect1 += 8;
+
+    // Load 8 uint8 elements (64 bits) from pVect2
+    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+    pVect2 += 8;
+
+    // Zero-extend uint8 to int32
+    __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+    // Convert int32 to float
+    __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+    // Dequantize with one FMA: v2_dequant = v2_f * delta_vec + min_val_vec
+    __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+    // Accumulate with a second FMA: sum256 = v1 * v2_dequant + sum256
+    sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256);
+}
+
+// Raw inner product of a float vector (pVect1v) with an SQ8 vector (pVect2v), i.e. `dimension`
+// uint8 codes followed by the float min_val and delta used to dequantize them.
+// Handles `residual` elements first, then 16 per iteration (presumably residual == dim % 16).
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float *pEnd1 = pVect1 + dimension;
+
+    // Read the dequantization parameters stored right after the `dimension` quantized bytes
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    __m256 min_val_vec = _mm256_set1_ps(min_val);
+    __m256 delta_vec = _mm256_set1_ps(delta);
+
+    __m256 sum256 = _mm256_setzero_ps();
+
+    // Handle the first residual % 8 elements with a masked float load. The 8-byte code load below
+    // reads past them, which is only safe because a full 16-element block is assumed to follow.
+    if constexpr (residual % 8) {
+        __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
+        __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
+        pVect1 += residual % 8;
+
+        // Load 8 codes; only the first residual % 8 of them belong to this step
+        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+        pVect2 += residual % 8;
+
+        // Zero-extend uint8 to int32
+        __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+        // Convert int32 to float
+        __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+        // Dequantize using FMA: (val * delta) + min_val
+        __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+        // v1's masked-out lanes are expected to be zero, so the extra codes add nothing here
+        sum256 = _mm256_mul_ps(v1, v2_dequant);
+    }
+
+    // If the remainder is >=8, have another step of 8 floats
+    if constexpr (residual >= 8) {
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+    }
+
+    // The residual is done; the rest is processed 16 elements (two 8-lane steps) per iteration.
+    do {
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+    } while (pVect1 < pEnd1);
+
+    return my_mm256_reduce_add_ps(sum256);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    // inv_norm is the third float stored after the `dimension` quantized bytes (min_val, delta)
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Raw inner product against the dequantized vector; the helper applies no normalization
+    float ip = SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
+
+    // Scale the raw inner product by the stored inv_norm, then convert the similarity
+    // into a distance as 1 - similarity
+    return 1.0f - ip * inv_norm;
+}
diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
index 6ea609f2d..89b1c0b6b 100644
--- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
@@ -67,7 +67,6 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen
 
         // Dequantize: (val * delta) + min_val
         __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
-        v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
 
         // Compute dot product with masking
         sum256 = _mm256_mul_ps(v1, v2_dequant);
diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
index f2f4efd52..3fd665111 100644
--- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
+++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
@@ -61,8 +61,8 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi
         // Load masked float elements
         __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1);
 
-        // Load masked uint8 elements
-        __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i *>(pVec2));
+        // Load full uint8 elements - we know that the first 16 elements are safe to load
+        __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
         __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
         __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
 
@@ -73,7 +73,7 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi
         __m512 product = _mm512_mul_ps(v1, dequantized);
 
         // Apply mask to product and add to sum
-        sum = _mm512_mask_add_ps(sum, mask, sum, product);
+        sum = _mm512_add_ps(sum, product); // plain add: fmadd(sum, sum, product) squares sum
 
         pVec1 += residual;
         pVec2 += residual;
@@ -86,7 +86,6 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi
 
     // Return the raw inner product result
     return _mm512_reduce_add_ps(sum);
-    ;
 }
 
 template <unsigned char residual> // 0..15
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index b7eb828d0..1bcd3a304 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -20,6 +20,7 @@
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/NEON.h"
@@ -74,6 +75,13 @@ dist_func_t<float> IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, cons
         return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
     }
 #endif
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3) {
+        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 16 * sizeof(float); // handles 16 floats
+        return Choose_SQ8_IP_implementation_AVX2_FMA(dim);
+    }
+#endif
 #ifdef OPT_AVX2
     if (features.avx2) {
         if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
@@ -133,6 +141,13 @@ dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment,
         return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
     }
 #endif
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3) {
+        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 16 * sizeof(float); // handles 16 floats
+        return Choose_SQ8_Cosine_implementation_AVX2_FMA(dim);
+    }
+#endif
 #ifdef OPT_AVX2
     if (features.avx2) {
         if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
new file mode 100644
index 000000000..fd5c38d5a
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+
+// One 8-element step of the squared-L2 accumulation for SQ8 vectors.
+//
+// Reads 8 floats from pVect1 and 8 uint8 codes from pVect2, dequantizes each code as
+// code * delta + min_val (delta_vec / min_val_vec hold those values in every lane), and adds
+// the squared lane-wise differences to sum256.
+//
+// Both pointers are passed by reference and are advanced by the 8 elements consumed.
+static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
+                             const __m256 &min_val_vec, const __m256 &delta_vec) {
+    // Load 8 float elements from pVect1
+    __m256 v1 = _mm256_loadu_ps(pVect1);
+    pVect1 += 8;
+
+    // Load 8 uint8 elements (64 bits) from pVect2
+    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+    pVect2 += 8;
+
+    // Zero-extend uint8 to int32
+    __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+    // Convert int32 to float
+    __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+    // Dequantize: v2_dequant = v2_f * delta_vec + min_val_vec
+    __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+    // Subtract first and square afterwards.
+    // The expanded form v1^2 - 2*v1*v2_dequant + v2_dequant^2 rounds each term separately,
+    // so when v1 is close to v2_dequant the terms cancel and the rounding error dominates:
+    // identical inputs can yield a non-zero or even negative "squared distance".
+    // It also costs extra multiplications per step.
+    // The residual path of SQ8_L2SqrSIMD16_AVX2_FMA already computes diff * diff, so this
+    // keeps both paths numerically consistent.
+    __m256 diff = _mm256_sub_ps(v1, v2_dequant);
+
+    // sum256 += diff * diff, fused into a single FMA
+    sum256 = _mm256_fmadd_ps(diff, diff, sum256);
+}
+
+// Squared L2 distance between a float vector (pVect1v) and an SQ8 vector (pVect2v), i.e.
+// `dimension` uint8 codes followed by the float min_val and delta used to dequantize them.
+// Handles `residual` elements first, then 16 per iteration (presumably residual == dim % 16).
+template <unsigned char residual> // 0..15
+float SQ8_L2SqrSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float *pEnd1 = pVect1 + dimension;
+
+    // Read the dequantization parameters stored right after the `dimension` quantized bytes
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    __m256 min_val_vec = _mm256_set1_ps(min_val);
+    __m256 delta_vec = _mm256_set1_ps(delta);
+
+    __m256 sum256 = _mm256_setzero_ps();
+
+    // Handle the first residual % 8 elements with a masked float load. The 8-byte code load below
+    // reads past them, which is only safe because a full 16-element block is assumed to follow.
+    if constexpr (residual % 8) {
+        __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
+        __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
+        pVect1 += residual % 8;
+
+        // Load 8 codes; only the first residual % 8 of them belong to this step
+        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+        pVect2 += residual % 8;
+
+        // Zero-extend uint8 to int32
+        __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+        // Convert int32 to float
+        __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+        // Dequantize using FMA, then zero the lanes past residual % 8 so they cannot add to diff
+        __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
+        v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
+
+        // Calculate squared difference
+        __m256 diff = _mm256_sub_ps(v1, v2_dequant);
+        sum256 = _mm256_mul_ps(diff, diff);
+    }
+
+    // If the remainder is >=8, have another step of 8 floats
+    if constexpr (residual >= 8) {
+        L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+    }
+
+    // The residual is done; the rest is processed 16 elements (two 8-lane steps) per iteration.
+    do {
+        L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+    } while (pVect1 < pEnd1);
+
+    return my_mm256_reduce_add_ps(sum256);
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 6e50a99bb..81f0df91d 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -19,6 +19,7 @@
 #include "VecSim/spaces/functions/AVX512FP16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/NEON.h"
@@ -74,6 +75,13 @@ dist_func_t<float> L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, cons
         return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim);
     }
 #endif
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3) {
+        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 16 * sizeof(float); // handles 16 floats
+        return Choose_SQ8_L2_implementation_AVX2_FMA(dim);
+    }
+#endif
 #ifdef OPT_AVX2
     if (features.avx2) {
         if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
diff --git a/src/VecSim/spaces/functions/AVX2_FMA.cpp b/src/VecSim/spaces/functions/AVX2_FMA.cpp
new file mode 100644
index 000000000..4dc627c57
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX2_FMA.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "AVX2_FMA.h"
+#include "VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h"
+#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+// FMA optimized implementations
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX2_FMA.h b/src/VecSim/spaces/functions/AVX2_FMA.h
new file mode 100644
index 000000000..80d5adb6d
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX2_FMA.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+namespace spaces {
+
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim);
+
+
+} // namespace spaces
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h
index d10e3ac76..d99bcc4ca 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces.h
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h
@@ -24,6 +24,7 @@
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/SSE3.h"
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
index d780b8285..8e7140bba 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
@@ -69,6 +69,12 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 1
                                  avx512_f_bw_vl_vnni_supported);
 #endif // AVX512_F_BW_VL_VNNI
 
+#ifdef OPT_AVX2_FMA
+bool avx2_fma3_supported = opt.avx2 && opt.fma3;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported);
+#endif // AVX2_FMA
+
 #ifdef AVX2
 // AVX2 functions
 bool avx2_supported = opt.avx2;
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index c9addd484..b660562d3 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -23,13 +23,14 @@
 #include "VecSim/spaces/functions/AVX512F.h"
 #include "VecSim/spaces/functions/AVX.h"
 #include "VecSim/spaces/functions/SSE.h"
-#include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/AVX512BW_VBMI2.h"
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
 #include "VecSim/spaces/functions/AVX512FP16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/SSE3.h"
+#include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/NEON.h"
 #include "VecSim/spaces/functions/NEON_DOTPROD.h"
@@ -317,16 +318,43 @@ void common_ip_sq8(bool should_normalize, float expected_dist) {
         v2_orig[i] = float(i + 1.5);
     }
 
+    // Create SQ8 compressed version of v2
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
     size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
-    std::vector<uint8_t> v2_compressed(compressed_size);
     if (should_normalize) {
         spaces::GetNormalizeFunc<float>()(v1_orig, dim);
         spaces::GetNormalizeFunc<float>()(v2_orig, dim);
     }
 
-    // Create SQ8 compressed version of v2
-    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
-    test_utils::quantize_float_vec_to_uint8(v2_orig, dim, v2_compressed.data());
+    // Find min and max for quantization
+    float min_val = v2_orig[0];
+    float max_val = v2_orig[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v2_orig[i]);
+        max_val = std::max(max_val, v2_orig[i]);
+    }
+
+    // Calculate delta (the quantization step size)
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    std::vector<uint8_t> v2_compressed(compressed_size);
+
+    // Quantize v2
+    uint8_t *quant_values = reinterpret_cast<uint8_t *>(v2_compressed.data());
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+
+    // Store parameters
+    params[0] = min_val;
+    params[1] = delta;
+
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v2_orig[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+    }
 
     float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
 
@@ -354,20 +382,47 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) {
         v2_orig[i] = float(i + 1.5);
     }
 
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
     size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
-    std::vector<uint8_t> v2_compressed(compressed_size);
-
     spaces::GetNormalizeFunc<float>()(v1_orig, dim);
-    spaces::GetNormalizeFunc<float>()(v2_orig, dim);
+    // Find min and max for quantization
+    float min_val = v2_orig[0];
+    float max_val = v2_orig[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v2_orig[i]);
+        max_val = std::max(max_val, v2_orig[i]);
+    }
+    // Calculate delta (the quantization step size)
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
 
-    // Create SQ8 compressed version of v2
-    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
-    test_utils::quantize_float_vec_to_uint8(v2_orig, dim, v2_compressed.data());
+    // Compress v2
+    std::vector<uint8_t> v2_compressed(compressed_size);
+    uint8_t *quant_values = reinterpret_cast<uint8_t *>(v2_compressed.data());
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v2_orig[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+    }
+    // Calculate inverse norm from decompressed values
+    float inv_norm = 0.0f;
+    for (size_t i = 0; i < dim; i++) {
+        float decompressed_value = min_val + quant_values[i] * delta;
+        inv_norm += decompressed_value * decompressed_value;
+    }
+    inv_norm = 1.0f / std::sqrt(inv_norm);
+    // Store parameters
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
 
     float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
     ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance";
 }
-
 TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) {
     // create a vector with extra space for the norm
     size_t dim = 5;
@@ -2063,7 +2118,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         v1_orig[i] = float(i + 1.5);
         v2_orig[i] = float(i * 0.75 + 1.0);
     }
-
+    
     // Create SQ8 compressed version of v2
     std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
 
@@ -2074,8 +2129,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
 
     dist_func_t<float> arch_opt_func;
     float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim);
-// Test different optimizations based on CPU features
-#ifdef OPT_AVX512_F_BW_VL_VNNI
+    // Test different optimizations based on CPU features
+    #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2087,21 +2142,34 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.avx512f = 0;
     }
-#endif
-#ifdef OPT_AVX2
+    #endif
+    #ifdef OPT_AVX2_FMA
+    if (optimization.avx2 && optimization.fma3) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.avx2 = optimization.fma3 = 0;
+    }
+    #endif
+    #ifdef OPT_AVX2
     if (optimization.avx2) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2(dim))
             << "Unexpected distance function chosen for dim " << dim;
-        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX with dim " << dim;
-        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX2 with dim " << dim;
-        // Unset avx flag as well, so we'll choose the next optimization (SSE4).
+        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
+        // Unset avx flag as well, so we'll choose the next optimization (SSE).
         optimization.avx2 = 0;
     }
-#endif
-#ifdef OPT_SSE4
+    #endif
+    #ifdef OPT_SSE4
     if (optimization.sse4_1) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2109,13 +2177,13 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "SSE with dim " << dim;
-        // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE4 with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim;
         // Unset sse flag as well, so we'll choose the next optimization (default).
         optimization.sse4_1 = 0;
     }
-#endif
+    #endif
 
-#ifdef OPT_SVE2
+    #ifdef OPT_SVE2
     if (optimization.sve2) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2127,8 +2195,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset sve2 flag as well, so we'll choose the next option (default).
         optimization.sve2 = 0;
     }
-#endif
-#ifdef OPT_SVE
+    #endif
+    #ifdef OPT_SVE
     if (optimization.sve) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2140,20 +2208,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset sve flag as well, so we'll choose the next option (default).
         optimization.sve = 0;
     }
-#endif
-#ifdef OPT_NEON
+    #endif
+    #ifdef OPT_NEON
     if (optimization.asimd) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim))
             << "Unexpected distance function chosen for dim " << dim;
-        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "NEON with dim " << dim;
         ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.asimd = 0;
     }
-#endif
+    #endif
+
 
     // Test default implementation
     unsigned char alignment = 0;
@@ -2190,8 +2259,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
     dist_func_t<float> arch_opt_func;
     float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim);
 
-// Test different optimizations based on CPU features
-#ifdef OPT_AVX512_F_BW_VL_VNNI
+    // Test different optimizations based on CPU features
+    #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2202,8 +2271,20 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
         optimization.avx512f = 0;
     }
-#endif
-#ifdef OPT_AVX
+    #endif
+    #ifdef OPT_AVX2_FMA
+    if (optimization.avx2 && optimization.fma3) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
+        optimization.avx2 = optimization.fma3 = 0;
+    }
+    #endif
+    #ifdef OPT_AVX2
     if (optimization.avx2) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2214,8 +2295,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
         optimization.avx2 = 0;
     }
-#endif
-#ifdef OPT_SSE4
+    #endif
+    #ifdef OPT_SSE
     if (optimization.sse4_1) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2223,24 +2304,24 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "SSE with dim " << dim;
-        // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE4 with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim;
         optimization.sse4_1 = 0;
     }
-#endif
-#ifdef OPT_SVE2
+    #endif
+    #ifdef OPT_SVE2
     if (optimization.sve2) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim))
             << "Unexpected distance function chosen for dim " << dim;
-        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "SVE2 with dim " << dim;
         ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
         // Unset sve2 flag as well, so we'll choose the next option (default).
         optimization.sve2 = 0;
     }
-#endif
-#ifdef OPT_SVE
+    #endif
+    #ifdef OPT_SVE
     if (optimization.sve) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2252,8 +2333,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // Unset sve flag as well, so we'll choose the next option (default).
         optimization.sve = 0;
     }
-#endif
-#ifdef OPT_NEON
+    #endif
+    #ifdef OPT_NEON
     if (optimization.asimd) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2265,7 +2346,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.asimd = 0;
     }
-#endif
+    #endif
+
 
     // Test default implementation
     unsigned char alignment = 0;
@@ -2308,7 +2390,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
     dist_func_t<float> arch_opt_func;
     float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim);
 
-#ifdef OPT_SVE2
+    #ifdef OPT_SVE2
     if (optimization.sve2) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2320,8 +2402,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim;
         optimization.sve2 = 0;
     }
-#endif
-#ifdef OPT_SVE
+    #endif
+    #ifdef OPT_SVE
     if (optimization.sve) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2333,8 +2415,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "SVE with dim " << dim;
         optimization.sve = 0;
     }
-#endif
-#ifdef OPT_NEON
+    #endif
+    #ifdef OPT_NEON
     if (optimization.asimd) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2346,10 +2428,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "NEON with dim " << dim;
         optimization.asimd = 0;
     }
-#endif
+    #endif
 
-// Test different optimizations based on CPU features
-#ifdef OPT_AVX512_F_BW_VL_VNNI
+    // Test different optimizations based on CPU features
+    #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2361,8 +2443,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim;
         optimization.avx512f = 0;
     }
-#endif
-#ifdef OPT_AVX2
+    #endif
+    #ifdef OPT_AVX2_FMA
+    if (optimization.avx2 && optimization.fma3) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX with dim " << dim;
+        // We don't align SQ8 vectors with cosine distance
+        // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim;
+        optimization.avx2 = optimization.fma3 = 0;
+    }
+    #endif
+    #ifdef OPT_AVX2
     if (optimization.avx2) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2374,9 +2469,9 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim;
         optimization.avx2 = 0;
     }
-#endif
+    #endif
 
-#ifdef OPT_SSE4
+    #ifdef OPT_SSE
     if (optimization.sse4_1) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2388,13 +2483,13 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "SSE with dim " << dim;
         optimization.sse4_1 = 0;
     }
-#endif
+    #endif
 
     // Test default implementation
     unsigned char alignment = 0;
     arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
-    ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim;
-    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+    ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " <<
+    dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
         << "No optimization with dim " << dim;
     ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
 }

From b47cc5239c1a7a89c2d62fafdaebee66360cdfce Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 18 May 2025 12:36:01 +0300
Subject: [PATCH 36/52] format

---
 src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h |   5 +-
 src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h |  10 +--
 src/VecSim/spaces/functions/AVX2_FMA.h |   1 -
 tests/unit/test_spaces.cpp             | 104 ++++++++++++-------------
 4 files changed, 59 insertions(+), 61 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
index 822277c93..007ee333e 100644
--- a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
@@ -9,8 +9,9 @@
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
 
-static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
-                                      const __m256 &min_val_vec, const __m256 &delta_vec) {
+static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2,
+                                           __m256 &sum256, const __m256 &min_val_vec,
+                                           const __m256 &delta_vec) {
     // Load 8 float elements from pVect1
     __m256 v1 = _mm256_loadu_ps(pVect1);
     pVect1 += 8;
diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
index fd5c38d5a..75ae892f9 100644
--- a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
@@ -10,7 +10,7 @@
 #include "VecSim/spaces/AVX_utils.h"
 
 static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
-                             const __m256 &min_val_vec, const __m256 &delta_vec) {
+                                 const __m256 &min_val_vec, const __m256 &delta_vec) {
     // Load 8 float elements from pVect1
     __m256 v1 = _mm256_loadu_ps(pVect1);
     pVect1 += 8;
@@ -31,19 +31,19 @@ static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, _
     // Calculate squared difference using FMA
     // (v1 - v2_dequant)^2 = v1^2 - 2*v1*v2_dequant + v2_dequant^2
     // Using FMA: v1^2 - 2*v1*v2_dequant + v2_dequant^2
-    
+
     // First, compute v2_dequant^2
     __m256 v2_dequant_squared = _mm256_mul_ps(v2_dequant, v2_dequant);
-    
+
     // Then, compute v1^2
     __m256 v1_squared = _mm256_mul_ps(v1, v1);
-    
+
     // Finally, compute -2*v1*v2_dequant + v2_dequant^2 + v1^2 using FMA
     // -2*v1*v2_dequant + v2_dequant^2 = -2 * v1 * v2_dequant + v2_dequant^2
     __m256 neg_2_v1 = _mm256_mul_ps(v1, _mm256_set1_ps(-2.0f));
     __m256 diff_squared = _mm256_fmadd_ps(neg_2_v1, v2_dequant, v2_dequant_squared);
     diff_squared = _mm256_add_ps(diff_squared, v1_squared);
-    
+
     // Add to running sum
     sum256 = _mm256_add_ps(sum256, diff_squared);
 }
diff --git a/src/VecSim/spaces/functions/AVX2_FMA.h b/src/VecSim/spaces/functions/AVX2_FMA.h
index 80d5adb6d..b81dfd5ab 100644
--- a/src/VecSim/spaces/functions/AVX2_FMA.h
+++ b/src/VecSim/spaces/functions/AVX2_FMA.h
@@ -16,5 +16,4 @@ dist_func_t<float> Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim);
 dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim);
 dist_func_t<float> Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim);
 
-
 } // namespace spaces
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index b660562d3..e1ba1a1bd 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2118,7 +2118,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         v1_orig[i] = float(i + 1.5);
         v2_orig[i] = float(i * 0.75 + 1.0);
     }
-    
+
     // Create SQ8 compressed version of v2
     std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
 
@@ -2129,8 +2129,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
 
     dist_func_t<float> arch_opt_func;
     float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim);
-    // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2142,8 +2142,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.avx512f = 0;
     }
-    #endif
-    #ifdef OPT_AVX2_FMA
+#endif
+#ifdef OPT_AVX2_FMA
     if (optimization.avx2 && optimization.fma3) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2155,21 +2155,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.avx2 = optimization.fma3 = 0;
     }
-    #endif
-    #ifdef OPT_AVX2
+#endif
+#ifdef OPT_AVX2
     if (optimization.avx2) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2(dim))
             << "Unexpected distance function chosen for dim " << dim;
-            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX with dim " << dim;
         // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
         // Unset avx flag as well, so we'll choose the next optimization (SSE).
         optimization.avx2 = 0;
     }
-    #endif
-    #ifdef OPT_SSE4
+#endif
+#ifdef OPT_SSE4
     if (optimization.sse4_1) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2181,9 +2181,9 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset sse flag as well, so we'll choose the next optimization (default).
         optimization.sse4_1 = 0;
     }
-    #endif
+#endif
 
-    #ifdef OPT_SVE2
+#ifdef OPT_SVE2
     if (optimization.sve2) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2195,8 +2195,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset sve2 flag as well, so we'll choose the next option (default).
         optimization.sve2 = 0;
     }
-    #endif
-    #ifdef OPT_SVE
+#endif
+#ifdef OPT_SVE
     if (optimization.sve) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2208,21 +2208,20 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
         // Unset sve flag as well, so we'll choose the next option (default).
         optimization.sve = 0;
     }
-    #endif
-    #ifdef OPT_NEON
+#endif
+#ifdef OPT_NEON
     if (optimization.asimd) {
         unsigned char alignment = 0;
         arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim))
             << "Unexpected distance function chosen for dim " << dim;
-            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "NEON with dim " << dim;
         ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.asimd = 0;
     }
-    #endif
-
+#endif
 
     // Test default implementation
     unsigned char alignment = 0;
@@ -2259,8 +2258,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
     dist_func_t<float> arch_opt_func;
     float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim);
 
-    // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2271,8 +2270,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
         optimization.avx512f = 0;
     }
-    #endif
-    #ifdef OPT_AVX2_FMA
+#endif
+#ifdef OPT_AVX2_FMA
     if (optimization.avx2 && optimization.fma3) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2283,8 +2282,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
         optimization.avx2 = optimization.fma3 = 0;
     }
-    #endif
-    #ifdef OPT_AVX2
+#endif
+#ifdef OPT_AVX2
     if (optimization.avx2) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2295,8 +2294,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
         optimization.avx2 = 0;
     }
-    #endif
-    #ifdef OPT_SSE
+#endif
+#ifdef OPT_SSE
     if (optimization.sse4_1) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2307,21 +2306,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim;
         optimization.sse4_1 = 0;
     }
-    #endif
-    #ifdef OPT_SVE2
+#endif
+#ifdef OPT_SVE2
     if (optimization.sve2) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim))
             << "Unexpected distance function chosen for dim " << dim;
-            ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "SVE2 with dim " << dim;
         ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
         // Unset sve2 flag as well, so we'll choose the next option (default).
         optimization.sve2 = 0;
     }
-    #endif
-    #ifdef OPT_SVE
+#endif
+#ifdef OPT_SVE
     if (optimization.sve) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2333,8 +2332,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // Unset sve flag as well, so we'll choose the next option (default).
         optimization.sve = 0;
     }
-    #endif
-    #ifdef OPT_NEON
+#endif
+#ifdef OPT_NEON
     if (optimization.asimd) {
         unsigned char alignment = 0;
         arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2346,8 +2345,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.asimd = 0;
     }
-    #endif
-
+#endif
 
     // Test default implementation
     unsigned char alignment = 0;
@@ -2390,7 +2388,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
     dist_func_t<float> arch_opt_func;
     float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim);
 
-    #ifdef OPT_SVE2
+#ifdef OPT_SVE2
     if (optimization.sve2) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2402,8 +2400,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim;
         optimization.sve2 = 0;
     }
-    #endif
-    #ifdef OPT_SVE
+#endif
+#ifdef OPT_SVE
     if (optimization.sve) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2415,8 +2413,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "SVE with dim " << dim;
         optimization.sve = 0;
     }
-    #endif
-    #ifdef OPT_NEON
+#endif
+#ifdef OPT_NEON
     if (optimization.asimd) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2428,10 +2426,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "NEON with dim " << dim;
         optimization.asimd = 0;
     }
-    #endif
+#endif
 
-    // Test different optimizations based on CPU features
-    #ifdef OPT_AVX512_F_BW_VL_VNNI
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
     if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2443,8 +2441,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim;
         optimization.avx512f = 0;
     }
-    #endif
-    #ifdef OPT_AVX2_FMA
+#endif
+#ifdef OPT_AVX2_FMA
     if (optimization.avx2 && optimization.fma3) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2456,8 +2454,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim;
         optimization.avx2 = optimization.fma3 = 0;
     }
-    #endif
-    #ifdef OPT_AVX2
+#endif
+#ifdef OPT_AVX2
     if (optimization.avx2) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2469,9 +2467,9 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim;
         optimization.avx2 = 0;
     }
-    #endif
+#endif
 
-    #ifdef OPT_SSE
+#ifdef OPT_SSE
     if (optimization.sse4_1) {
         unsigned char alignment = 0;
         arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
@@ -2483,13 +2481,13 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
         // ASSERT_EQ(alignment, 0) << "SSE with dim " << dim;
         optimization.sse4_1 = 0;
     }
-    #endif
+#endif
 
     // Test default implementation
     unsigned char alignment = 0;
     arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
-    ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " <<
-    dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+    ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
         << "No optimization with dim " << dim;
     ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
 }

From 6566a0b7a3cf60a55abf9cb2f5ad361368ebddf0 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 18 May 2025 14:14:25 +0300
Subject: [PATCH 37/52] remove opt.avx2

---
 tests/unit/test_spaces.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index e1ba1a1bd..afdf7d01d 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2153,7 +2153,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
             << "AVX with dim " << dim;
         // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
         // Unset optimizations flag, so we'll choose the next optimization.
-        optimization.avx2 = optimization.fma3 = 0;
+        optimization.fma3 = 0;
     }
 #endif
 #ifdef OPT_AVX2
@@ -2280,7 +2280,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX with dim " << dim;
         // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
-        optimization.avx2 = optimization.fma3 = 0;
+        optimization.fma3 = 0;
     }
 #endif
 #ifdef OPT_AVX2
@@ -2452,7 +2452,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
             << "AVX with dim " << dim;
         // We don't align SQ8 vectors with cosine distance
         // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim;
-        optimization.avx2 = optimization.fma3 = 0;
+        optimization.fma3 = 0;
     }
 #endif
 #ifdef OPT_AVX2

From d767ea92f6ce1408863165ed79c3ef249aa75b3c Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 18 May 2025 15:20:27 +0300
Subject: [PATCH 38/52] fix OPT_AVX2 bm-spaces

---
 tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
index 8e7140bba..1349a3512 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
@@ -75,7 +75,7 @@ INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma
 INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported);
 #endif // AVX2_FMA
 
-#ifdef AVX2
+#ifdef OPT_AVX2
 // AVX2 functions
 bool avx2_supported = opt.avx2;
 INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported);

From ea0ac003733097a9e5e054a3887ac84d73d12cd3 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Wed, 21 May 2025 17:19:39 +0300
Subject: [PATCH 39/52] PR changes

---
 src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 51 +++++++++++++++++-------------
 src/VecSim/spaces/IP/IP_SVE_SQ8.h  |  8 ++---
 src/VecSim/spaces/L2/L2_SVE_SQ8.h  |  8 ++---
 3 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
index 0a6f3ee8c..9822b03fb 100644
--- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
@@ -10,24 +10,26 @@
 #include <iostream>
 #include <string.h>
 
-static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod,
-                                    const __m128 &min_val_vec, const __m128 &delta_vec) {
-    // Load 4 float elements from pVect1
-    __m128 v1 = _mm_loadu_ps(pVect1);
-    pVect1 += 4;
-
-    // Load 4 uint8 elements from pVect2, convert to int32, then to float
-    __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float *)pVect2)));
-    pVect2 += 4;
-
-    // Convert int32 to float
-    __m128 v2_f = _mm_cvtepi32_ps(v2_i);
-
-    // Dequantize: (val * delta) + min_val
-    __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec);
-
-    // Compute dot product and add to sum
-    sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2_dequant));
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, 
+                                   __m128 &sum_prod1, __m128 &sum_prod2,
+                                   const __m128 &min_val_vec, const __m128 &delta_vec) {
+    // Load first 4 elements
+    __m128 v1a = _mm_loadu_ps(pVect1);
+    __m128i v2a_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2));
+    
+    // Load next 4 elements
+    __m128 v1b = _mm_loadu_ps(pVect1 + 4);
+    __m128i v2b_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2 + 4));
+    
+    pVect1 += 8;
+    pVect2 += 8;
+
+    // Process both sets
+    __m128 v2a_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2a_i), delta_vec), min_val_vec);
+    __m128 v2b_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2b_i), delta_vec), min_val_vec);
+    
+    sum_prod1 = _mm_add_ps(sum_prod1, _mm_mul_ps(v1a, v2a_dequant));
+    sum_prod2 = _mm_add_ps(sum_prod2, _mm_mul_ps(v1b, v2b_dequant));
 }
 
 template <unsigned char residual> // 0..15
@@ -45,7 +47,9 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v,
 
     const float *pEnd1 = pVect1 + dimension;
 
-    __m128 sum = _mm_setzero_ps();
+    // Initialize two sum accumulators
+    __m128 sum1 = _mm_setzero_ps();
+    __m128 sum2 = _mm_setzero_ps();
 
     // Process residual elements if needed
     if constexpr (residual) {
@@ -86,15 +90,18 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v,
 
             pVect1 += residual % 4;
             quantized += residual % 4;
-            sum = _mm_mul_ps(v1, v2_dequant);
+            sum1 = _mm_mul_ps(v1, v2_dequant); // Use sum1 for residual
         }
     }
 
-    // Process 4 elements at a time
+    // Process 8 elements at a time
     while (pVect1 < pEnd1) {
-        InnerProductStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, quantized, sum1, sum2, min_val_vec, delta_vec);
     }
 
+    // Combine the two sums
+    __m128 sum = _mm_add_ps(sum1, sum2);
+
     // TmpRes must be 16 bytes aligned.
     float PORTABLE_ALIGN16 TmpRes[4];
     _mm_store_ps(TmpRes, sum);
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
index 4beaf81ca..116bd8325 100644
--- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
@@ -23,13 +23,13 @@ static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2
     svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // LD1UB: loa
 
     // Convert uint32 to float32
-    svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32);
+    svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32);
 
     // Dequantize: (val * delta) + min_val
-    svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec);
+    svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec, v2_f, delta_vec);
 
     // Compute dot product and add to sum
-    sum = svmla_f32_z(pg, sum, v1, v2_dequant);
+    sum = svmla_f32_x(pg, sum, v1, v2_dequant);
 
     // Move to the next set of elements
     offset += svcntw();
@@ -80,7 +80,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
 
             // Dequantize: (val * delta) + min_val
             svfloat32_t v2_dequant =
-                svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec);
+                svmla_f32_z(pg_partial, min_val_vec, v2_f, delta_vec);
 
             // Compute dot product and add to sum
             sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant);
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
index 8bce46365..4ab24b297 100644
--- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
@@ -21,16 +21,16 @@ static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_
     svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset);
 
     // Convert uint32 to float32
-    svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32);
+    svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32);
 
     // Dequantize: (val * delta) + min_val
-    svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec);
+    svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec, v2_f, delta_vec);
 
     // Compute difference
-    svfloat32_t diff = svsub_f32_z(pg, v1, v2_dequant);
+    svfloat32_t diff = svsub_f32_x(pg, v1, v2_dequant);
 
     // Square difference and add to sum
-    sum = svmla_f32_z(pg, sum, diff, diff);
+    sum = svmla_f32_x(pg, sum, diff, diff);
 
     // Move to the next set of elements
     offset += svcntw();

From ef09ead6d9fb830944c71303c6a26b2b4fcf457d Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 22 May 2025 09:35:19 +0300
Subject: [PATCH 40/52] format

---
 src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 12 ++++++------
 src/VecSim/spaces/IP/IP_SVE_SQ8.h  |  3 +--
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
index 9822b03fb..b32989838 100644
--- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
@@ -10,24 +10,24 @@
 #include <iostream>
 #include <string.h>
 
-static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, 
-                                   __m128 &sum_prod1, __m128 &sum_prod2,
-                                   const __m128 &min_val_vec, const __m128 &delta_vec) {
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod1,
+                                    __m128 &sum_prod2, const __m128 &min_val_vec,
+                                    const __m128 &delta_vec) {
     // Load first 4 elements
     __m128 v1a = _mm_loadu_ps(pVect1);
     __m128i v2a_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2));
-    
+
     // Load next 4 elements
     __m128 v1b = _mm_loadu_ps(pVect1 + 4);
     __m128i v2b_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2 + 4));
-    
+
     pVect1 += 8;
     pVect2 += 8;
 
     // Process both sets
     __m128 v2a_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2a_i), delta_vec), min_val_vec);
     __m128 v2b_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2b_i), delta_vec), min_val_vec);
-    
+
     sum_prod1 = _mm_add_ps(sum_prod1, _mm_mul_ps(v1a, v2a_dequant));
     sum_prod2 = _mm_add_ps(sum_prod2, _mm_mul_ps(v1b, v2b_dequant));
 }
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
index 116bd8325..863ef3652 100644
--- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
@@ -79,8 +79,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
             svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32);
 
             // Dequantize: (val * delta) + min_val
-            svfloat32_t v2_dequant =
-                svmla_f32_z(pg_partial, min_val_vec, v2_f, delta_vec);
+            svfloat32_t v2_dequant = svmla_f32_z(pg_partial, min_val_vec, v2_f, delta_vec);
 
             // Compute dot product and add to sum
             sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant);

From 7567730949ca029e2f5bafe0f17fce6d5651dc71 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 22 May 2025 09:56:14 +0300
Subject: [PATCH 41/52] change to _mm_cvtsi32_si128

---
 src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 51 +++++++++++++-----------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
index b32989838..67ebc4547 100644
--- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
@@ -10,26 +10,24 @@
 #include <iostream>
 #include <string.h>
 
-static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod1,
-                                    __m128 &sum_prod2, const __m128 &min_val_vec,
-                                    const __m128 &delta_vec) {
-    // Load first 4 elements
-    __m128 v1a = _mm_loadu_ps(pVect1);
-    __m128i v2a_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2));
-
-    // Load next 4 elements
-    __m128 v1b = _mm_loadu_ps(pVect1 + 4);
-    __m128i v2b_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2 + 4));
-
-    pVect1 += 8;
-    pVect2 += 8;
-
-    // Process both sets
-    __m128 v2a_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2a_i), delta_vec), min_val_vec);
-    __m128 v2b_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2b_i), delta_vec), min_val_vec);
-
-    sum_prod1 = _mm_add_ps(sum_prod1, _mm_mul_ps(v1a, v2a_dequant));
-    sum_prod2 = _mm_add_ps(sum_prod2, _mm_mul_ps(v1b, v2b_dequant));
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod,
+                                    const __m128 &min_val_vec, const __m128 &delta_vec) {
+    // Load 4 float elements from pVect1
+    __m128 v1 = _mm_loadu_ps(pVect1);
+    pVect1 += 4;
+
+    // Load 4 uint8 elements from pVect2, convert to int32, then to float
+    __m128i v2_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)pVect2));
+    pVect2 += 4;
+
+    // Convert int32 to float
+    __m128 v2_f = _mm_cvtepi32_ps(v2_i);
+
+    // Dequantize: (val * delta) + min_val
+    __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec);
+
+    // Compute dot product and add to sum
+    sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2_dequant));
 }
 
 template <unsigned char residual> // 0..15
@@ -47,9 +45,7 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v,
 
     const float *pEnd1 = pVect1 + dimension;
 
-    // Initialize two sum accumulators
-    __m128 sum1 = _mm_setzero_ps();
-    __m128 sum2 = _mm_setzero_ps();
+    __m128 sum = _mm_setzero_ps();
 
     // Process residual elements if needed
     if constexpr (residual) {
@@ -90,18 +86,15 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v,
 
             pVect1 += residual % 4;
             quantized += residual % 4;
-            sum1 = _mm_mul_ps(v1, v2_dequant); // Use sum1 for residual
+            sum = _mm_mul_ps(v1, v2_dequant);
         }
     }
 
-    // Process 8 elements at a time
+    // Process 4 elements at a time
     while (pVect1 < pEnd1) {
-        InnerProductStep(pVect1, quantized, sum1, sum2, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, quantized, sum, min_val_vec, delta_vec);
     }
 
-    // Combine the two sums
-    __m128 sum = _mm_add_ps(sum1, sum2);
-
     // TmpRes must be 16 bytes aligned.
     float PORTABLE_ALIGN16 TmpRes[4];
     _mm_store_ps(TmpRes, sum);

From a767547476c75edf1d3f4ec24b0e1231c15a1c76 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 22 May 2025 10:07:38 +0300
Subject: [PATCH 42/52] change to _mm_cvtsi32_si128 in the L2 SSE4 SQ8 kernel

---
 src/VecSim/spaces/L2/L2_SSE4_SQ8.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
index 3ee673d3d..16b60286b 100644
--- a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
@@ -16,7 +16,7 @@ static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m12
     pVect1 += 4;
 
     // Load 4 uint8 elements from pVect2, convert to int32, then to float
-    __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float *)pVect2)));
+    __m128i v2_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)pVect2));
     pVect2 += 4;
 
     // Convert int32 to float

From e6422dc40bd122b68f931ab61b3e585f4b3387a2 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Tue, 27 May 2025 09:17:06 +0300
Subject: [PATCH 43/52] PR changes

---
 src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h | 19 ++++---------------
 src/VecSim/spaces/L2/L2_AVX2_SQ8.h     | 11 +++--------
 2 files changed, 7 insertions(+), 23 deletions(-)

diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
index 75ae892f9..2cff76a31 100644
--- a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
@@ -1,3 +1,4 @@
+
 /*
  * Copyright (c) 2006-Present, Redis Ltd.
  * All rights reserved.
@@ -28,21 +29,9 @@ static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, _
     // Dequantize: v2_dequant = v2_f * delta_vec + min_val_vec
     __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
 
-    // Calculate squared difference using FMA
-    // (v1 - v2_dequant)^2 = v1^2 - 2*v1*v2_dequant + v2_dequant^2
-    // Using FMA: v1^2 - 2*v1*v2_dequant + v2_dequant^2
-
-    // First, compute v2_dequant^2
-    __m256 v2_dequant_squared = _mm256_mul_ps(v2_dequant, v2_dequant);
-
-    // Then, compute v1^2
-    __m256 v1_squared = _mm256_mul_ps(v1, v1);
-
-    // Finally, compute -2*v1*v2_dequant + v2_dequant^2 + v1^2 using FMA
-    // -2*v1*v2_dequant + v2_dequant^2 = -2 * v1 * v2_dequant + v2_dequant^2
-    __m256 neg_2_v1 = _mm256_mul_ps(v1, _mm256_set1_ps(-2.0f));
-    __m256 diff_squared = _mm256_fmadd_ps(neg_2_v1, v2_dequant, v2_dequant_squared);
-    diff_squared = _mm256_add_ps(diff_squared, v1_squared);
+    // Calculate squared difference - simple and efficient approach
+    __m256 diff = _mm256_sub_ps(v1, v2_dequant);
+    __m256 diff_squared = _mm256_mul_ps(diff, diff);
 
     // Add to running sum
     sum256 = _mm256_add_ps(sum256, diff_squared);
diff --git a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h
index 2d2702763..bdde99e62 100644
--- a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h
@@ -58,13 +58,8 @@ float SQ8_L2SqrSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dime
         __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
         pVect1 += residual % 8;
 
-        uint8_t temp_buf[8] = {0};
-        // Manually copy elements
-        for (size_t i = 0; i < residual % 8; i++) {
-            temp_buf[i] = pVect2[i];
-        }
-        // Load from buffer
-        __m128i v2_128 = _mm_loadl_epi64((__m128i *)temp_buf);
+        // Direct load - safe because we only process the masked elements
+        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
         pVect2 += residual % 8;
 
         // Zero-extend uint8 to int32
@@ -76,10 +71,10 @@ float SQ8_L2SqrSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dime
         // Dequantize: (val * delta) + min_val
         __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
 
+        // Apply mask to zero out unused elements
         v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
 
         __m256 diff = _mm256_sub_ps(v1, v2_dequant);
-
         sum = _mm256_mul_ps(diff, diff);
     }
 

From 10a609865276f39c5ee4879eae6ec2a46f2c4839 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Tue, 27 May 2025 11:51:24 +0300
Subject: [PATCH 44/52] added chunk to functions

---
 src/VecSim/spaces/IP/IP_SVE_FP32.h | 22 +++++++++++-----------
 src/VecSim/spaces/IP/IP_SVE_SQ8.h  | 26 +++++++++++++-------------
 src/VecSim/spaces/L2/L2_SVE_FP32.h | 23 ++++++++++++-----------
 src/VecSim/spaces/L2/L2_SVE_SQ8.h  | 24 ++++++++++++------------
 4 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_SVE_FP32.h b/src/VecSim/spaces/IP/IP_SVE_FP32.h
index c60acb16a..c1cc79ccd 100644
--- a/src/VecSim/spaces/IP/IP_SVE_FP32.h
+++ b/src/VecSim/spaces/IP/IP_SVE_FP32.h
@@ -11,13 +11,13 @@
 #include <arm_sve.h>
 
 static inline void InnerProductStep(float *&pVect1, float *&pVect2, size_t &offset,
-                                    svfloat32_t &sum) {
+                                    svfloat32_t &sum, const size_t chunk) {
     svfloat32_t v1 = svld1_f32(svptrue_b32(), pVect1 + offset);
     svfloat32_t v2 = svld1_f32(svptrue_b32(), pVect2 + offset);
 
     sum = svmla_f32_x(svptrue_b32(), sum, v1, v2);
 
-    offset += svcntw();
+    offset += chunk;
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
@@ -26,33 +26,33 @@ float FP32_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t
     float *pVect2 = (float *)pVect2v;
     size_t offset = 0;
 
-    uint64_t sve_word_count = svcntw();
+    uint64_t chunk = svcntw();
 
     svfloat32_t sum0 = svdup_f32(0.0f);
     svfloat32_t sum1 = svdup_f32(0.0f);
     svfloat32_t sum2 = svdup_f32(0.0f);
     svfloat32_t sum3 = svdup_f32(0.0f);
 
-    auto chunk_size = 4 * sve_word_count;
+    auto chunk_size = 4 * chunk;
     const size_t number_of_chunks = dimension / chunk_size;
     for (size_t i = 0; i < number_of_chunks; i++) {
-        InnerProductStep(pVect1, pVect2, offset, sum0);
-        InnerProductStep(pVect1, pVect2, offset, sum1);
-        InnerProductStep(pVect1, pVect2, offset, sum2);
-        InnerProductStep(pVect1, pVect2, offset, sum3);
+        InnerProductStep(pVect1, pVect2, offset, sum0, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum1, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum2, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum3, chunk);
     }
 
     // Process remaining complete SVE vectors that didn't fit into the main loop
     // These are full vector operations (0-3 elements)
     if constexpr (additional_steps > 0) {
         if constexpr (additional_steps >= 1) {
-            InnerProductStep(pVect1, pVect2, offset, sum0);
+            InnerProductStep(pVect1, pVect2, offset, sum0, chunk);
         }
         if constexpr (additional_steps >= 2) {
-            InnerProductStep(pVect1, pVect2, offset, sum1);
+            InnerProductStep(pVect1, pVect2, offset, sum1, chunk);
         }
         if constexpr (additional_steps >= 3) {
-            InnerProductStep(pVect1, pVect2, offset, sum3);
+            InnerProductStep(pVect1, pVect2, offset, sum3, chunk);
         }
     }
 
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
index 863ef3652..7b9bd86bc 100644
--- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
@@ -13,7 +13,7 @@
 
 static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset,
                                     svfloat32_t &sum, const svfloat32_t &min_val_vec,
-                                    const svfloat32_t &delta_vec) {
+                                    const svfloat32_t &delta_vec, const size_t chunk) {
     svbool_t pg = svptrue_b32();
 
     // Load float elements from pVect1
@@ -32,7 +32,7 @@ static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2
     sum = svmla_f32_x(pg, sum, v1, v2_dequant);
 
     // Move to the next set of elements
-    offset += svcntw();
+    offset += chunk;
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
@@ -51,7 +51,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
     svfloat32_t delta_vec = svdup_f32(delta);
 
     // Get the number of 32-bit elements per vector at runtime
-    uint64_t sve_word_count = svcntw();
+    uint64_t chunk = svcntw();
 
     // Multiple accumulators to increase instruction-level parallelism
     svfloat32_t sum0 = svdup_f32(0.0f);
@@ -61,7 +61,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
 
     // Handle partial chunk if needed
     if constexpr (partial_chunk) {
-        size_t remaining = dimension % sve_word_count;
+        size_t remaining = dimension % chunk;
         if (remaining > 0) {
             // Create predicate for the remaining elements
             svbool_t pg_partial =
@@ -90,26 +90,26 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz
     }
 
     // Process 4 chunks at a time in the main loop
-    auto chunk_size = 4 * sve_word_count;
+    auto chunk_size = 4 * chunk;
     const size_t number_of_chunks =
-        (dimension - (partial_chunk ? dimension % sve_word_count : 0)) / chunk_size;
+        (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
 
     for (size_t i = 0; i < number_of_chunks; i++) {
-        InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec);
-        InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec);
-        InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
-        InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec, chunk);
     }
 
     // Handle remaining steps (0-3)
     if constexpr (additional_steps > 0) {
-        InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
     }
     if constexpr (additional_steps > 1) {
-        InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
     }
     if constexpr (additional_steps > 2) {
-        InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
     }
 
     // Combine the accumulators
diff --git a/src/VecSim/spaces/L2/L2_SVE_FP32.h b/src/VecSim/spaces/L2/L2_SVE_FP32.h
index a3e96c7a8..8367baa97 100644
--- a/src/VecSim/spaces/L2/L2_SVE_FP32.h
+++ b/src/VecSim/spaces/L2/L2_SVE_FP32.h
@@ -9,7 +9,8 @@
 #include "VecSim/spaces/space_includes.h"
 #include <arm_sve.h>
 
-static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset, svfloat32_t &sum) {
+static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset, svfloat32_t &sum,
+                                const size_t chunk) {
     // Load vectors
     svfloat32_t v1 = svld1_f32(svptrue_b32(), pVect1 + offset);
     svfloat32_t v2 = svld1_f32(svptrue_b32(), pVect2 + offset);
@@ -21,7 +22,7 @@ static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset,
     sum = svmla_f32_z(svptrue_b32(), sum, diff, diff);
 
     // Advance pointers by the vector length
-    offset += svcntw();
+    offset += chunk;
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
@@ -31,7 +32,7 @@ float FP32_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimens
     size_t offset = 0;
 
     // Get the number of 32-bit elements per vector at runtime
-    uint64_t sve_word_count = svcntw();
+    uint64_t chunk = svcntw();
 
     // Multiple accumulators to increase instruction-level parallelism
     svfloat32_t sum0 = svdup_f32(0.0f);
@@ -40,27 +41,27 @@ float FP32_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimens
     svfloat32_t sum3 = svdup_f32(0.0f);
 
     // Process vectors in chunks, with unrolling for better pipelining
-    auto chunk_size = 4 * sve_word_count;
+    auto chunk_size = 4 * chunk;
     size_t number_of_chunks = dimension / chunk_size;
     for (size_t i = 0; i < number_of_chunks; ++i) {
         // Process 4 chunks with separate accumulators
-        L2SquareStep(pVect1, pVect2, offset, sum0);
-        L2SquareStep(pVect1, pVect2, offset, sum1);
-        L2SquareStep(pVect1, pVect2, offset, sum2);
-        L2SquareStep(pVect1, pVect2, offset, sum3);
+        L2SquareStep(pVect1, pVect2, offset, sum0, chunk);
+        L2SquareStep(pVect1, pVect2, offset, sum1, chunk);
+        L2SquareStep(pVect1, pVect2, offset, sum2, chunk);
+        L2SquareStep(pVect1, pVect2, offset, sum3, chunk);
     }
 
     // Process remaining complete SVE vectors that didn't fit into the main loop
     // These are full vector operations (0-3 elements)
     if constexpr (additional_steps > 0) {
         if constexpr (additional_steps >= 1) {
-            L2SquareStep(pVect1, pVect2, offset, sum0);
+            L2SquareStep(pVect1, pVect2, offset, sum0, chunk);
         }
         if constexpr (additional_steps >= 2) {
-            L2SquareStep(pVect1, pVect2, offset, sum1);
+            L2SquareStep(pVect1, pVect2, offset, sum1, chunk);
         }
         if constexpr (additional_steps >= 3) {
-            L2SquareStep(pVect1, pVect2, offset, sum2);
+            L2SquareStep(pVect1, pVect2, offset, sum2, chunk);
         }
     }
 
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
index 4ab24b297..756f82522 100644
--- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
@@ -11,7 +11,7 @@
 
 static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset,
                              svfloat32_t &sum, const svfloat32_t &min_val_vec,
-                             const svfloat32_t &delta_vec) {
+                             const svfloat32_t &delta_vec, const size_t chunk) {
     svbool_t pg = svptrue_b32();
 
     // Load float elements from pVect1
@@ -33,7 +33,7 @@ static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_
     sum = svmla_f32_x(pg, sum, diff, diff);
 
     // Move to the next set of elements
-    offset += svcntw();
+    offset += chunk;
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
@@ -52,7 +52,7 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi
     svfloat32_t delta_vec = svdup_f32(delta);
 
     // Get the number of 32-bit elements per vector at runtime
-    uint64_t sve_word_count = svcntw();
+    uint64_t chunk = svcntw();
 
     // Multiple accumulators to increase instruction-level parallelism
     svfloat32_t sum0 = svdup_f32(0.0f);
@@ -62,7 +62,7 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi
 
     // Handle partial chunk if needed
     if constexpr (partial_chunk) {
-        size_t remaining = dimension % sve_word_count;
+        size_t remaining = dimension % chunk;
         if (remaining > 0) {
             // Create predicate for the remaining elements
             svbool_t pg_partial =
@@ -93,24 +93,24 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi
     }
     // Handle remaining steps (0-3)
     if constexpr (additional_steps > 0) {
-        L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
     }
     if constexpr (additional_steps > 1) {
-        L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
     }
     if constexpr (additional_steps > 2) {
-        L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
     }
 
     // Process 4 chunks at a time in the main loop
-    auto chunk_size = 4 * sve_word_count;
+    auto chunk_size = 4 * chunk;
     size_t number_of_chunks = dimension / chunk_size;
 
     for (size_t i = 0; i < number_of_chunks; i++) {
-        L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec);
-        L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec);
-        L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec);
-        L2SqrStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
+        L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
+        L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
+        L2SqrStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec, chunk);
     }
 
     // Combine the accumulators

From 767e1904daae2d07bde95dad9cef780cb90f5809 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Tue, 27 May 2025 13:10:30 +0300
Subject: [PATCH 45/52] diff squared

---
 src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
index 2cff76a31..708807f98 100644
--- a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
@@ -31,10 +31,9 @@ static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, _
 
     // Calculate squared difference - simple and efficient approach
     __m256 diff = _mm256_sub_ps(v1, v2_dequant);
-    __m256 diff_squared = _mm256_mul_ps(diff, diff);
-
-    // Add to running sum
-    sum256 = _mm256_add_ps(sum256, diff_squared);
+    
+    // Use FMA for diff² + sum in one instruction
+    sum256 = _mm256_fmadd_ps(diff, diff, sum256);
 }
 
 template <unsigned char residual> // 0..15

From 44be2751efee926828c421cce5947f8fdda30e01 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Tue, 27 May 2025 13:19:41 +0300
Subject: [PATCH 46/52] format

---
 src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
index 708807f98..dfbbaa9e9 100644
--- a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h
@@ -31,7 +31,7 @@ static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, _
 
     // Calculate squared difference - simple and efficient approach
     __m256 diff = _mm256_sub_ps(v1, v2_dequant);
-    
+
     // Use FMA for diff² + sum in one instruction
     sum256 = _mm256_fmadd_ps(diff, diff, sum256);
 }

From 3a956bfe49f228b8bad2de2f0196c24f9d0dd0dd Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Tue, 27 May 2025 13:55:59 +0300
Subject: [PATCH 47/52] change diff

---
 tests/unit/test_spaces.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index afdf7d01d..cdc5eb33a 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -421,7 +421,7 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) {
     params[2] = inv_norm;
 
     float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
-    ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance";
+    ASSERT_NEAR(dist, 0.0f, 0.000001f) << "SQ8_Cosine failed to match expected distance";
 }
 TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) {
     // create a vector with extra space for the norm
@@ -474,7 +474,7 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) {
     params[2] = inv_norm;
 
     float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
-    ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance";
+    ASSERT_NEAR(dist, 0.0f, 0.00001f) << "SQ8_Cosine failed to match expected distance";
 }
 
 /* ======================== Test Getters ======================== */

From 5840e3fe6cd7805bb865f56531fa105cf0bd4bee Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 5 Jun 2025 17:31:27 +0300
Subject: [PATCH 48/52] Remove align from tests improve sse4

---
 src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 39 ++++++++++++++----------------
 src/VecSim/spaces/IP_space.cpp     | 16 ------------
 tests/unit/test_spaces.cpp         | 24 ------------------
 3 files changed, 18 insertions(+), 61 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
index 67ebc4547..1bad27610 100644
--- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
@@ -52,36 +52,33 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v,
         // Handle residual elements (1-3)
         if constexpr (residual % 4) {
             __m128 v1;
-            __m128 v2_dequant = _mm_setzero_ps();
+            __m128 v2_dequant;
 
             if constexpr (residual % 4 == 3) {
-                // Load 3 floats and set the last one to 0
-                v1 = _mm_load_ss(pVect1);                     // load 1 float, set the rest to 0
-                v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1)); // load 2 more floats into high part
+                // Set 3 floats and the last one to 0
+                v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]);
 
-                // Dequantize first value
-                float dequant0 = quantized[0] * delta + min;
-                v2_dequant = _mm_load_ss(&dequant0);
-
-                // Dequantize next two values
-                float dequant_high[2] = {quantized[1] * delta + min, quantized[2] * delta + min};
-                v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high);
+                // Dequantize and set 3 values
+                v2_dequant = _mm_set_ps(0.0f,
+                                       quantized[2] * delta + min,
+                                       quantized[1] * delta + min,
+                                       quantized[0] * delta + min);
 
             } else if constexpr (residual % 4 == 2) {
-                // Load 2 floats and set the last two to 0
-                v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1);
+                // Set 2 floats and the last two to 0
+                v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]);
 
-                // Dequantize two values
-                float dequant_high[2] = {quantized[0] * delta + min, quantized[1] * delta + min};
-                v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high);
+                // Dequantize and set 2 values
+                v2_dequant = _mm_set_ps(0.0f, 0.0f,
+                                       quantized[1] * delta + min,
+                                       quantized[0] * delta + min);
 
             } else if constexpr (residual % 4 == 1) {
-                // Load 1 float and set the last three to 0
-                v1 = _mm_load_ss(pVect1);
+                // Set 1 float and the last three to 0
+                v1 = _mm_set_ps(0.0f, 0.0f, 0.0f, pVect1[0]);
 
-                // Dequantize one value
-                float dequant0 = quantized[0] * delta + min;
-                v2_dequant = _mm_load_ss(&dequant0);
+                // Dequantize and set 1 value
+                v2_dequant = _mm_set_ps(0.0f, 0.0f, 0.0f, quantized[0] * delta + min);
             }
 
             pVect1 += residual % 4;
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 1bcd3a304..d24c1d142 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -70,29 +70,21 @@ dist_func_t<float> IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, cons
     }
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vnni) {
-        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 16 * sizeof(float); // handles 16 floats
         return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
     }
 #endif
 #ifdef OPT_AVX2_FMA
     if (features.avx2 && features.fma3) {
-        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 16 * sizeof(float); // handles 16 floats
         return Choose_SQ8_IP_implementation_AVX2_FMA(dim);
     }
 #endif
 #ifdef OPT_AVX2
     if (features.avx2) {
-        if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 8 * sizeof(float); // handles 8 floats
         return Choose_SQ8_IP_implementation_AVX2(dim);
     }
 #endif
 #ifdef OPT_SSE4
     if (features.sse4_1) {
-        if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 4 * sizeof(float); // handles 4 floats
         return Choose_SQ8_IP_implementation_SSE4(dim);
     }
 #endif
@@ -136,29 +128,21 @@ dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment,
     }
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vnni) {
-        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 16 * sizeof(float); // handles 16 floats
         return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
     }
 #endif
 #ifdef OPT_AVX2_FMA
     if (features.avx2 && features.fma3) {
-        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 16 * sizeof(float); // handles 16 floats
         return Choose_SQ8_Cosine_implementation_AVX2_FMA(dim);
     }
 #endif
 #ifdef OPT_AVX2
     if (features.avx2) {
-        if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 8 * sizeof(float); // handles 8 floats
         return Choose_SQ8_Cosine_implementation_AVX2(dim);
     }
 #endif
 #ifdef OPT_SSE4
     if (features.sse4_1) {
-        if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 4 * sizeof(float); // handles 4 floats
         return Choose_SQ8_Cosine_implementation_SSE4(dim);
     }
 #endif
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index cdc5eb33a..dabe9c794 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -2267,7 +2267,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX512 with dim " << dim;
-        // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
         optimization.avx512f = 0;
     }
 #endif
@@ -2279,7 +2278,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX with dim " << dim;
-        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
         optimization.fma3 = 0;
     }
 #endif
@@ -2291,7 +2289,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX with dim " << dim;
-        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim;
         optimization.avx2 = 0;
     }
 #endif
@@ -2303,7 +2300,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "SSE with dim " << dim;
-        // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim;
         optimization.sse4_1 = 0;
     }
 #endif
@@ -2315,8 +2311,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "SVE2 with dim " << dim;
-        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
-        // Unset sve2 flag as well, so we'll choose the next option (default).
         optimization.sve2 = 0;
     }
 #endif
@@ -2328,8 +2322,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "SVE with dim " << dim;
-        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
-        // Unset sve flag as well, so we'll choose the next option (default).
         optimization.sve = 0;
     }
 #endif
@@ -2341,8 +2333,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "NEON with dim " << dim;
-        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
-        // Unset optimizations flag, so we'll choose the next optimization.
         optimization.asimd = 0;
     }
 #endif
@@ -2396,8 +2386,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "SVE2 with dim " << dim;
-        // We don't align SQ8 vectors with cosine distance
-        // ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim;
         optimization.sve2 = 0;
     }
 #endif
@@ -2409,8 +2397,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "SVE with dim " << dim;
-        // We don't align SQ8 vectors with cosine distance
-        // ASSERT_EQ(alignment, 0) << "SVE with dim " << dim;
         optimization.sve = 0;
     }
 #endif
@@ -2422,8 +2408,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "NEON with dim " << dim;
-        // We don't align SQ8 vectors with cosine distance
-        // ASSERT_EQ(alignment, 0) << "NEON with dim " << dim;
         optimization.asimd = 0;
     }
 #endif
@@ -2437,8 +2421,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX512 with dim " << dim;
-        // We don't align SQ8 vectors with cosine distance
-        // ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim;
         optimization.avx512f = 0;
     }
 #endif
@@ -2450,8 +2432,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX with dim " << dim;
-        // We don't align SQ8 vectors with cosine distance
-        // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim;
         optimization.fma3 = 0;
     }
 #endif
@@ -2463,8 +2443,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "AVX with dim " << dim;
-        // We don't align SQ8 vectors with cosine distance
-        // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim;
         optimization.avx2 = 0;
     }
 #endif
@@ -2477,8 +2455,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
             << "SSE with dim " << dim;
-        // We don't align SQ8 vectors with cosine distance
-        // ASSERT_EQ(alignment, 0) << "SSE with dim " << dim;
         optimization.sse4_1 = 0;
     }
 #endif

From 2a89dd8d7d9a2d696fc22198b1cec48d655c4096 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 5 Jun 2025 17:33:12 +0300
Subject: [PATCH 49/52] format

---
 src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
index 1bad27610..5e47af2b6 100644
--- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
@@ -59,19 +59,16 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v,
                 v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]);
 
                 // Dequantize and set 3 values
-                v2_dequant = _mm_set_ps(0.0f,
-                                       quantized[2] * delta + min,
-                                       quantized[1] * delta + min,
-                                       quantized[0] * delta + min);
+                v2_dequant = _mm_set_ps(0.0f, quantized[2] * delta + min,
+                                        quantized[1] * delta + min, quantized[0] * delta + min);
 
             } else if constexpr (residual % 4 == 2) {
                 // Set 2 floats and the last two to 0
                 v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]);
 
                 // Dequantize and set 2 values
-                v2_dequant = _mm_set_ps(0.0f, 0.0f,
-                                       quantized[1] * delta + min,
-                                       quantized[0] * delta + min);
+                v2_dequant =
+                    _mm_set_ps(0.0f, 0.0f, quantized[1] * delta + min, quantized[0] * delta + min);
 
             } else if constexpr (residual % 4 == 1) {
                 // Set 1 float and the last three to 0

From e562a864aab30416817f7f04b683164320839f24 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 8 Jun 2025 18:09:21 +0300
Subject: [PATCH 50/52] applied to l2

---
 src/VecSim/spaces/L2/L2_SSE4_SQ8.h | 57 ++++++++++++++----------------
 1 file changed, 26 insertions(+), 31 deletions(-)

diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
index 16b60286b..4b20ef351 100644
--- a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
@@ -35,11 +35,11 @@ static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m12
 template <unsigned char residual> // 0..15
 float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const float *pVect1 = static_cast<const float *>(pVect1v);
-    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const uint8_t *quantized = static_cast<const uint8_t *>(pVect2v);
 
     // Get dequantization parameters from the end of quantized vector
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    const float min_val = *reinterpret_cast<const float *>(quantized + dimension);
+    const float delta = *reinterpret_cast<const float *>(quantized + dimension + sizeof(float));
 
     // Create broadcast vectors for SIMD operations
     __m128 min_val_vec = _mm_set1_ps(min_val);
@@ -54,40 +54,35 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime
         // Handle residual elements (1-3)
         if constexpr (residual % 4) {
             __m128 v1;
-            __m128 v2_dequant = _mm_setzero_ps();
+            __m128 v2_dequant;
 
             if constexpr (residual % 4 == 3) {
                 // Load 3 floats and set the last one to 0
-                v1 = _mm_load_ss(pVect1);                     // load 1 float, set the rest to 0
-                v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1)); // load 2 more floats into high part
+                v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]);
 
-                // Dequantize first value
-                float dequant0 = pVect2[0] * delta + min_val;
-                v2_dequant = _mm_load_ss(&dequant0);
-
-                // Dequantize next two values
-                float dequant_high[2] = {pVect2[1] * delta + min_val, pVect2[2] * delta + min_val};
-                v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high);
+                // Dequantize and set 3 values
+                v2_dequant = _mm_set_ps(0.0f, quantized[2] * delta + min_val,
+                                                  quantized[1] * delta + min_val, quantized[0] * delta + min_val);
 
             } else if constexpr (residual % 4 == 2) {
-                // Load 2 floats and set the last two to 0
-                v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1);
+                // Set 2 floats and the last two to 0
+                v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]);
 
-                // Dequantize two values
-                float dequant_high[2] = {pVect2[0] * delta + min_val, pVect2[1] * delta + min_val};
-                v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high);
+                // Dequantize and set 2 valuesAdd commentMore actions
+                v2_dequant = _mm_set_ps(0.0f, 0.0f,
+                                       quantized[1] * delta + min_val,
+                                       quantized[0] * delta + min_val);
 
             } else if constexpr (residual % 4 == 1) {
-                // Load 1 float and set the last three to 0
-                v1 = _mm_load_ss(pVect1);
+                // Set 1 float and the last three to 0Add commentMore actions
+                v1 = _mm_set_ps(0.0f, 0.0f, 0.0f, pVect1[0]);
 
-                // Dequantize one value
-                float dequant0 = pVect2[0] * delta + min_val;
-                v2_dequant = _mm_load_ss(&dequant0);
+                // Dequantize and set 1 value
+                v2_dequant = _mm_set_ps(0.0f, 0.0f, 0.0f, quantized[0] * delta + min_val);
             }
 
             pVect1 += residual % 4;
-            pVect2 += residual % 4;
+            quantized += residual % 4;
 
             // Compute difference
             __m128 diff = _mm_sub_ps(v1, v2_dequant);
@@ -98,19 +93,19 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime
 
         // Process remaining blocks of 4 elements based on residual
         if constexpr (residual >= 12)
-            L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+            L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
         if constexpr (residual >= 8)
-            L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+            L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
         if constexpr (residual >= 4)
-            L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+            L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
     }
 
     // Process 16 elements at a time (4 elements per step, 4 steps)
     while (pVect1 < pEnd1) {
-        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
-        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
-        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
-        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
     }
 
     // TmpRes must be 16 bytes aligned

From 2a0b4e642424612d0ab74af0192e05209a398570 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 8 Jun 2025 18:28:38 +0300
Subject: [PATCH 51/52] format

---
 src/VecSim/spaces/L2/L2_SSE4_SQ8.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
index 4b20ef351..cd36a4e91 100644
--- a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
+++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
@@ -61,17 +61,17 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime
                 v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]);
 
                 // Dequantize and set 3 values
-                v2_dequant = _mm_set_ps(0.0f, quantized[2] * delta + min_val,
-                                                  quantized[1] * delta + min_val, quantized[0] * delta + min_val);
+                v2_dequant =
+                    _mm_set_ps(0.0f, quantized[2] * delta + min_val, quantized[1] * delta + min_val,
+                               quantized[0] * delta + min_val);
 
             } else if constexpr (residual % 4 == 2) {
                 // Set 2 floats and the last two to 0
                 v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]);
 
                 // Dequantize and set 2 valuesAdd commentMore actions
-                v2_dequant = _mm_set_ps(0.0f, 0.0f,
-                                       quantized[1] * delta + min_val,
-                                       quantized[0] * delta + min_val);
+                v2_dequant = _mm_set_ps(0.0f, 0.0f, quantized[1] * delta + min_val,
+                                        quantized[0] * delta + min_val);
 
             } else if constexpr (residual % 4 == 1) {
                 // Set 1 float and the last three to 0Add commentMore actions

From ab18690bc1426bf77fade3f0148af4831308fc75 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Sun, 8 Jun 2025 19:18:40 +0300
Subject: [PATCH 52/52] Remove alignment l2

---
 src/VecSim/spaces/L2_space.cpp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 81f0df91d..ed920927d 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -70,29 +70,21 @@ dist_func_t<float> L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, cons
     }
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vnni) {
-        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 16 * sizeof(float); // handles 16 floats
         return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim);
     }
 #endif
 #ifdef OPT_AVX2_FMA
     if (features.avx2 && features.fma3) {
-        if (dim % 16 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 16 * sizeof(float); // handles 16 floats
         return Choose_SQ8_L2_implementation_AVX2_FMA(dim);
     }
 #endif
 #ifdef OPT_AVX2
     if (features.avx2) {
-        if (dim % 8 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 8 * sizeof(float); // handles 8 floats
         return Choose_SQ8_L2_implementation_AVX2(dim);
     }
 #endif
 #ifdef OPT_SSE4
     if (features.sse4_1) {
-        if (dim % 4 == 0) // no point in aligning if we have an offsetting residual
-            *alignment = 4 * sizeof(float); // handles 4 floats
         return Choose_SQ8_L2_implementation_SSE4(dim);
     }
 #endif