Skip to content

[0.6] Beng arm opt f64 - [MOD-9077] #648

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 9, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions src/VecSim/spaces/IP/IP_NEON_FP64.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
*Copyright Redis Ltd. 2021 - present
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
*the Server Side Public License v1 (SSPLv1).
*/

#include "VecSim/spaces/space_includes.h"
#include <arm_neon.h>

// Accumulate the dot product of one 2-lane chunk: sum += v1 * v2.
// Both cursors are advanced past the two doubles they consumed.
inline void InnerProductStep(double *&pVect1, double *&pVect2, float64x2_t &sum) {
    const float64x2_t a = vld1q_f64(pVect1);
    const float64x2_t b = vld1q_f64(pVect2);
    pVect1 += 2;
    pVect2 += 2;
    sum = vmlaq_f64(sum, a, b);
}

/// NEON inner-product distance for FP64 vectors.
/// @tparam residual  dimension % 8, the number of trailing doubles (0..7)
///                   that fall outside the unrolled 8-element main loop.
/// @return 1.0 - <pVect1v, pVect2v>, the inner-product distance.
template <unsigned char residual> // 0..7
double FP64_InnerProductSIMD8_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *cursor1 = (double *)pVect1v;
    double *cursor2 = (double *)pVect2v;

    // Four independent accumulators hide the latency of the multiply-accumulate.
    float64x2_t acc0 = vdupq_n_f64(0.0);
    float64x2_t acc1 = vdupq_n_f64(0.0);
    float64x2_t acc2 = vdupq_n_f64(0.0);
    float64x2_t acc3 = vdupq_n_f64(0.0);

    // Main loop: each iteration consumes 8 doubles (4 steps x 2 lanes).
    for (size_t blocks = dimension / 8; blocks != 0; --blocks) {
        InnerProductStep(cursor1, cursor2, acc0);
        InnerProductStep(cursor1, cursor2, acc1);
        InnerProductStep(cursor1, cursor2, acc2);
        InnerProductStep(cursor1, cursor2, acc3);
    }

    // Residual (0..7 doubles): first consume the whole 2-double pairs,
    // fully unrolled at compile time.
    constexpr unsigned char full_pairs = residual / 2;
    if constexpr (full_pairs >= 1) {
        InnerProductStep(cursor1, cursor2, acc0);
    }
    if constexpr (full_pairs >= 2) {
        InnerProductStep(cursor1, cursor2, acc1);
    }
    if constexpr (full_pairs >= 3) {
        InnerProductStep(cursor1, cursor2, acc2);
    }

    // Then at most one trailing scalar, loaded into lane 0 with lane 1 zeroed
    // so the vector multiply-accumulate contributes exactly v1[0]*v2[0].
    // This branch vanishes at compile time for even residuals.
    if constexpr (residual % 2 == 1) {
        float64x2_t t1 = vdupq_n_f64(0.0);
        float64x2_t t2 = vdupq_n_f64(0.0);
        t1 = vld1q_lane_f64(cursor1, t1, 0);
        t2 = vld1q_lane_f64(cursor2, t2, 0);
        sum3 = vmlaq_f64(sum3, t1, t2), (void)0; // see note below
    }

    // Pairwise-combine the four accumulators, then reduce across lanes.
    const float64x2_t total = vaddq_f64(vaddq_f64(acc0, acc1), vaddq_f64(acc2, acc3));
    const double dot = vaddvq_f64(total); // lane0 + lane1
    return 1.0 - dot;
}
73 changes: 73 additions & 0 deletions src/VecSim/spaces/IP/IP_SVE_FP64.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
*Copyright Redis Ltd. 2021 - present
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
*the Server Side Public License v1 (SSPLv1).
*/

#include "VecSim/spaces/space_includes.h"

#include <arm_sve.h>

/// Accumulate one full SVE vector's worth of products: sum += v1 * v2.
///
/// Fix: the base pointers were previously taken by mutable reference
/// (`double *&`) although they are never modified — a misleading leftover
/// from the NEON variant, where the pointers themselves advance. Progress
/// here is tracked solely through `offset`, so the pointers are now taken
/// by value as pointers-to-const. All existing call sites still bind
/// (a `double *` lvalue converts to `const double *`).
///
/// @param pVect1  base of the first vector (read-only)
/// @param pVect2  base of the second vector (read-only)
/// @param offset  element index to load from; advanced by `chunk` on return
/// @param sum     running accumulator, updated in place
/// @param chunk   number of doubles per SVE vector (svcntd())
inline void InnerProductStep(const double *pVect1, const double *pVect2, size_t &offset,
                             svfloat64_t &sum, const size_t chunk) {
    // Load vectors
    svfloat64_t v1 = svld1_f64(svptrue_b64(), pVect1 + offset);
    svfloat64_t v2 = svld1_f64(svptrue_b64(), pVect2 + offset);

    // Multiply-accumulate
    sum = svmla_f64_x(svptrue_b64(), sum, v1, v2);

    // Advance the element offset (pointers are untouched)
    offset += chunk;
}

/// SVE inner-product distance for FP64 vectors.
/// @tparam partial_chunk     whether dimension leaves a tail shorter than one
///                           SVE vector, handled with a predicated load.
/// @tparam additional_steps  number of full SVE vectors (0..3) left over
///                           after the 4x-unrolled main loop.
/// @return 1.0 - <pVect1v, pVect2v>, the inner-product distance.
template <bool partial_chunk, unsigned char additional_steps>
double FP64_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *vec1 = (double *)pVect1v;
    double *vec2 = (double *)pVect2v;
    const size_t vl = svcntd(); // doubles per SVE vector (runtime VL)
    size_t offset = 0;

    // Four independent accumulators to raise instruction-level parallelism.
    svfloat64_t acc0 = svdup_f64(0.0);
    svfloat64_t acc1 = svdup_f64(0.0);
    svfloat64_t acc2 = svdup_f64(0.0);
    svfloat64_t acc3 = svdup_f64(0.0);

    // Main loop, unrolled 4x: each iteration consumes 4 * vl doubles.
    const size_t block = 4 * vl;
    for (size_t remaining = dimension / block; remaining > 0; --remaining) {
        InnerProductStep(vec1, vec2, offset, acc0, vl);
        InnerProductStep(vec1, vec2, offset, acc1, vl);
        InnerProductStep(vec1, vec2, offset, acc2, vl);
        InnerProductStep(vec1, vec2, offset, acc3, vl);
    }

    // Leftover full vectors (0..3), resolved at compile time.
    if constexpr (additional_steps >= 1) {
        InnerProductStep(vec1, vec2, offset, acc0, vl);
    }
    if constexpr (additional_steps >= 2) {
        InnerProductStep(vec1, vec2, offset, acc1, vl);
    }
    if constexpr (additional_steps >= 3) {
        InnerProductStep(vec1, vec2, offset, acc2, vl);
    }

    // Tail shorter than one vector: predicate masks the out-of-range lanes,
    // and the merging (_m) multiply-accumulate leaves inactive lanes of acc3
    // untouched.
    if constexpr (partial_chunk) {
        svbool_t pg =
            svwhilelt_b64(static_cast<uint64_t>(offset), static_cast<uint64_t>(dimension));
        svfloat64_t v1 = svld1_f64(pg, vec1 + offset);
        svfloat64_t v2 = svld1_f64(pg, vec2 + offset);
        acc3 = svmla_f64_m(pg, acc3, v1, v2);
    }

    // Pairwise-combine the accumulators, then a single horizontal reduction.
    acc0 = svadd_f64_x(svptrue_b64(), acc0, acc1);
    acc2 = svadd_f64_x(svptrue_b64(), acc2, acc3);
    svfloat64_t total = svadd_f64_x(svptrue_b64(), acc0, acc2);
    return 1.0 - svaddv_f64(svptrue_b64(), total);
}
23 changes: 22 additions & 1 deletion src/VecSim/spaces/IP_space.cpp
Original file line number Diff line number Diff line change
@@ -91,8 +91,10 @@ dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
#ifdef CPU_FEATURES_ARCH_X86_64

CalculationGuideline optimization_type = FP64_GetCalculationGuideline(dim);
#endif

switch (arch_opt) {
#ifdef CPU_FEATURES_ARCH_X86_64
case ARCH_OPT_AVX512_DQ:
#ifdef OPT_AVX512DQ
{
@@ -140,10 +142,29 @@ dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
ret_dist_func = dist_funcs[optimization_type];
} break;
#endif
#endif // __x86_64__ */
#ifdef CPU_FEATURES_ARCH_AARCH64
case ARCH_OPT_SVE2:
#ifdef OPT_SVE2
ret_dist_func = Choose_FP64_IP_implementation_SVE2(dim);
break;

#endif
case ARCH_OPT_SVE:
#ifdef OPT_SVE
ret_dist_func = Choose_FP64_IP_implementation_SVE(dim);
break;

#endif
case ARCH_OPT_NEON:
#ifdef OPT_NEON
ret_dist_func = Choose_FP64_IP_implementation_NEON(dim);
break;
#endif
#endif // CPU_FEATURES_ARCH_AARCH64
case ARCH_OPT_NONE:
break;
} // switch
#endif // __x86_64__ */
return ret_dist_func;
}

76 changes: 76 additions & 0 deletions src/VecSim/spaces/L2/L2_NEON_FP64.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
*Copyright Redis Ltd. 2021 - present
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
*the Server Side Public License v1 (SSPLv1).
*/

#include "VecSim/spaces/space_includes.h"
#include <arm_neon.h>

// Accumulate the squared L2 contribution of one 2-lane chunk:
// sum += (v1 - v2)^2, element-wise. Both cursors advance past the
// two doubles they consumed.
inline void L2SquareStep(double *&pVect1, double *&pVect2, float64x2_t &sum) {
    const float64x2_t a = vld1q_f64(pVect1);
    const float64x2_t b = vld1q_f64(pVect2);
    pVect1 += 2;
    pVect2 += 2;

    // Element-wise difference, then squared and accumulated.
    const float64x2_t delta = vsubq_f64(a, b);
    sum = vmlaq_f64(sum, delta, delta);
}

/// NEON squared-L2 distance for FP64 vectors.
/// @tparam residual  dimension % 8, the number of trailing doubles (0..7)
///                   that fall outside the unrolled 8-element main loop.
/// @return sum over all i of (pVect1v[i] - pVect2v[i])^2.
template <unsigned char residual> // 0..7
double FP64_L2SqrSIMD8_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *cursor1 = (double *)pVect1v;
    double *cursor2 = (double *)pVect2v;

    // Four independent accumulators hide the latency of the multiply-accumulate.
    float64x2_t acc0 = vdupq_n_f64(0.0);
    float64x2_t acc1 = vdupq_n_f64(0.0);
    float64x2_t acc2 = vdupq_n_f64(0.0);
    float64x2_t acc3 = vdupq_n_f64(0.0);

    // Main loop: each iteration consumes 8 doubles (4 steps x 2 lanes).
    for (size_t blocks = dimension / 8; blocks != 0; --blocks) {
        L2SquareStep(cursor1, cursor2, acc0);
        L2SquareStep(cursor1, cursor2, acc1);
        L2SquareStep(cursor1, cursor2, acc2);
        L2SquareStep(cursor1, cursor2, acc3);
    }

    // Residual (0..7 doubles): first consume the whole 2-double pairs,
    // fully unrolled at compile time.
    constexpr unsigned char full_pairs = residual / 2;
    if constexpr (full_pairs >= 1) {
        L2SquareStep(cursor1, cursor2, acc0);
    }
    if constexpr (full_pairs >= 2) {
        L2SquareStep(cursor1, cursor2, acc1);
    }
    if constexpr (full_pairs >= 3) {
        L2SquareStep(cursor1, cursor2, acc2);
    }

    // Then at most one trailing scalar, loaded into lane 0 with lane 1 zeroed
    // so the squared difference of lane 1 contributes exactly 0. This branch
    // vanishes at compile time for even residuals.
    if constexpr (residual % 2 != 0) {
        float64x2_t t1 = vdupq_n_f64(0.0);
        float64x2_t t2 = vdupq_n_f64(0.0);
        t1 = vld1q_lane_f64(cursor1, t1, 0);
        t2 = vld1q_lane_f64(cursor2, t2, 0);

        const float64x2_t delta = vsubq_f64(t1, t2);
        acc3 = vmlaq_f64(acc3, delta, delta);
    }

    // Pairwise-combine the four accumulators, then reduce across lanes.
    const float64x2_t total = vaddq_f64(vaddq_f64(acc0, acc1), vaddq_f64(acc2, acc3));
    return vaddvq_f64(total); // lane0 + lane1
}
81 changes: 81 additions & 0 deletions src/VecSim/spaces/L2/L2_SVE_FP64.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
*Copyright Redis Ltd. 2021 - present
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
*the Server Side Public License v1 (SSPLv1).
*/

#include "VecSim/spaces/space_includes.h"
#include <arm_sve.h>

/// Accumulate the squared L2 contribution of one full SVE vector:
/// sum += (v1 - v2)^2, element-wise.
///
/// Fix: the base pointers were previously taken by mutable reference
/// (`double *&`) although they are never modified — a misleading leftover
/// from the NEON variant, where the pointers themselves advance. Progress
/// here is tracked solely through `offset`, so the pointers are now taken
/// by value as pointers-to-const. All existing call sites still bind
/// (a `double *` lvalue converts to `const double *`).
///
/// @param pVect1  base of the first vector (read-only)
/// @param pVect2  base of the second vector (read-only)
/// @param offset  element index to load from; advanced by `chunk` on return
/// @param sum     running accumulator, updated in place
/// @param chunk   number of doubles per SVE vector (svcntd())
inline void L2SquareStep(const double *pVect1, const double *pVect2, size_t &offset,
                         svfloat64_t &sum, const size_t chunk) {
    // Load vectors
    svfloat64_t v1 = svld1_f64(svptrue_b64(), pVect1 + offset);
    svfloat64_t v2 = svld1_f64(svptrue_b64(), pVect2 + offset);

    // Calculate difference between vectors
    svfloat64_t diff = svsub_f64_x(svptrue_b64(), v1, v2);

    // Square the difference and accumulate: sum += diff * diff
    sum = svmla_f64_x(svptrue_b64(), sum, diff, diff);

    // Advance the element offset (pointers are untouched)
    offset += chunk;
}

/// SVE squared-L2 distance for FP64 vectors.
/// @tparam partial_chunk     whether dimension leaves a tail shorter than one
///                           SVE vector, handled with a predicated load.
/// @tparam additional_steps  number of full SVE vectors (0..3) left over
///                           after the 4x-unrolled main loop.
/// @return sum over all i of (pVect1v[i] - pVect2v[i])^2.
template <bool partial_chunk, unsigned char additional_steps>
double FP64_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    double *vec1 = (double *)pVect1v;
    double *vec2 = (double *)pVect2v;
    const size_t vl = svcntd(); // doubles per SVE vector (runtime VL)
    size_t offset = 0;

    // Four independent accumulators to raise instruction-level parallelism.
    svfloat64_t acc0 = svdup_f64(0.0);
    svfloat64_t acc1 = svdup_f64(0.0);
    svfloat64_t acc2 = svdup_f64(0.0);
    svfloat64_t acc3 = svdup_f64(0.0);

    // Main loop, unrolled 4x: each iteration consumes 4 * vl doubles.
    const size_t block = 4 * vl;
    for (size_t remaining = dimension / block; remaining > 0; --remaining) {
        L2SquareStep(vec1, vec2, offset, acc0, vl);
        L2SquareStep(vec1, vec2, offset, acc1, vl);
        L2SquareStep(vec1, vec2, offset, acc2, vl);
        L2SquareStep(vec1, vec2, offset, acc3, vl);
    }

    // Leftover full vectors (0..3), resolved at compile time.
    if constexpr (additional_steps >= 1) {
        L2SquareStep(vec1, vec2, offset, acc0, vl);
    }
    if constexpr (additional_steps >= 2) {
        L2SquareStep(vec1, vec2, offset, acc1, vl);
    }
    if constexpr (additional_steps >= 3) {
        L2SquareStep(vec1, vec2, offset, acc2, vl);
    }

    // Tail shorter than one vector: predicate masks the out-of-range lanes,
    // and the merging (_m) multiply-accumulate leaves inactive lanes of acc3
    // untouched.
    if constexpr (partial_chunk) {
        svbool_t pg =
            svwhilelt_b64(static_cast<uint64_t>(offset), static_cast<uint64_t>(dimension));

        svfloat64_t v1 = svld1_f64(pg, vec1 + offset);
        svfloat64_t v2 = svld1_f64(pg, vec2 + offset);

        svfloat64_t delta = svsub_f64_x(pg, v1, v2);
        acc3 = svmla_f64_m(pg, acc3, delta, delta);
    }

    // Pairwise-combine the accumulators, then a single horizontal reduction.
    acc0 = svadd_f64_x(svptrue_b64(), acc0, acc1);
    acc2 = svadd_f64_x(svptrue_b64(), acc2, acc3);
    svfloat64_t total = svadd_f64_x(svptrue_b64(), acc0, acc2);
    return svaddv_f64(svptrue_b64(), total);
}
24 changes: 21 additions & 3 deletions src/VecSim/spaces/L2_space.cpp
Original file line number Diff line number Diff line change
@@ -93,10 +93,11 @@ dist_func_t<double> L2_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch

dist_func_t<double> ret_dist_func = FP64_L2Sqr;
#ifdef CPU_FEATURES_ARCH_X86_64

CalculationGuideline optimization_type = FP64_GetCalculationGuideline(dim);
#endif

switch (arch_opt) {
#ifdef CPU_FEATURES_ARCH_X86_64
case ARCH_OPT_AVX512_DQ:
#ifdef OPT_AVX512DQ
{
@@ -143,11 +144,28 @@ dist_func_t<double> L2_FP64_GetDistFunc(size_t dim, const Arch_Optimization arch
ret_dist_func = dist_funcs[optimization_type];
} break;
#endif
#endif // __x86_64__

#ifdef CPU_FEATURES_ARCH_AARCH64
case ARCH_OPT_SVE2:
#ifdef OPT_SVE2
ret_dist_func = Choose_FP64_L2_implementation_SVE2(dim);
break;
#endif
case ARCH_OPT_SVE:
#ifdef OPT_SVE
ret_dist_func = Choose_FP64_L2_implementation_SVE(dim);
break;
#endif
case ARCH_OPT_NEON:
#ifdef OPT_NEON
ret_dist_func = Choose_FP64_L2_implementation_NEON(dim);
break;
#endif
#endif // __aarch64__
case ARCH_OPT_NONE:
break;
} // switch

#endif // __x86_64__ */
return ret_dist_func;
}

Loading
Oops, something went wrong.
Loading
Oops, something went wrong.