Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 27 additions & 27 deletions include/simsimd/dot.h
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,7 @@ SIMSIMD_PUBLIC void simsimd_dot_f32_sve(simsimd_f32_t const *a_scalars, simsimd_
svbool_t pg_vec = svwhilelt_b32((unsigned int)idx_scalars, (unsigned int)count_scalars);
svfloat32_t a_vec = svld1_f32(pg_vec, a_scalars + idx_scalars);
svfloat32_t b_vec = svld1_f32(pg_vec, b_scalars + idx_scalars);
ab_vec = svmla_f32_x(pg_vec, ab_vec, a_vec, b_vec);
ab_vec = svmla_f32_m(pg_vec, ab_vec, a_vec, b_vec);
idx_scalars += svcntw();
} while (idx_scalars < count_scalars);
simsimd_distance_t reduced = svaddv_f32(svptrue_b32(), ab_vec);
Expand All @@ -670,10 +670,10 @@ SIMSIMD_PUBLIC void simsimd_dot_f32c_sve(simsimd_f32c_t const *a_pairs, simsimd_
svfloat32_t a_imag_vec = svget2_f32(a_vec, 1);
svfloat32_t b_real_vec = svget2_f32(b_vec, 0);
svfloat32_t b_imag_vec = svget2_f32(b_vec, 1);
ab_real_vec = svmla_f32_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmls_f32_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f32_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmla_f32_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
ab_real_vec = svmla_f32_m(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmls_f32_m(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f32_m(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmla_f32_m(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcntw();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f32(svptrue_b32(), ab_real_vec);
Expand All @@ -694,10 +694,10 @@ SIMSIMD_PUBLIC void simsimd_vdot_f32c_sve(simsimd_f32c_t const *a_pairs, simsimd
svfloat32_t a_imag_vec = svget2_f32(a_vec, 1);
svfloat32_t b_real_vec = svget2_f32(b_vec, 0);
svfloat32_t b_imag_vec = svget2_f32(b_vec, 1);
ab_real_vec = svmla_f32_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmla_f32_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f32_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmls_f32_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
ab_real_vec = svmla_f32_m(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmla_f32_m(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f32_m(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmls_f32_m(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcntw();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f32(svptrue_b32(), ab_real_vec);
Expand All @@ -713,7 +713,7 @@ SIMSIMD_PUBLIC void simsimd_dot_f64_sve(simsimd_f64_t const *a_scalars, simsimd_
svbool_t pg_vec = svwhilelt_b64((unsigned int)idx_scalars, (unsigned int)count_scalars);
svfloat64_t a_vec = svld1_f64(pg_vec, a_scalars + idx_scalars);
svfloat64_t b_vec = svld1_f64(pg_vec, b_scalars + idx_scalars);
ab_vec = svmla_f64_x(pg_vec, ab_vec, a_vec, b_vec);
ab_vec = svmla_f64_m(pg_vec, ab_vec, a_vec, b_vec);
idx_scalars += svcntd();
} while (idx_scalars < count_scalars);
simsimd_distance_t reduced = svaddv_f64(svptrue_b32(), ab_vec);
Expand All @@ -734,10 +734,10 @@ SIMSIMD_PUBLIC void simsimd_dot_f64c_sve(simsimd_f64c_t const *a_pairs, simsimd_
svfloat64_t a_imag_vec = svget2_f64(a_vec, 1);
svfloat64_t b_real_vec = svget2_f64(b_vec, 0);
svfloat64_t b_imag_vec = svget2_f64(b_vec, 1);
ab_real_vec = svmla_f64_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmls_f64_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f64_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmla_f64_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
ab_real_vec = svmla_f64_m(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmls_f64_m(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f64_m(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmla_f64_m(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcntd();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f64(svptrue_b64(), ab_real_vec);
Expand All @@ -758,10 +758,10 @@ SIMSIMD_PUBLIC void simsimd_vdot_f64c_sve(simsimd_f64c_t const *a_pairs, simsimd
svfloat64_t a_imag_vec = svget2_f64(a_vec, 1);
svfloat64_t b_real_vec = svget2_f64(b_vec, 0);
svfloat64_t b_imag_vec = svget2_f64(b_vec, 1);
ab_real_vec = svmla_f64_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmla_f64_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f64_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmls_f64_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
ab_real_vec = svmla_f64_m(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmla_f64_m(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f64_m(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmls_f64_m(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcntd();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f64(svptrue_b64(), ab_real_vec);
Expand All @@ -784,7 +784,7 @@ SIMSIMD_PUBLIC void simsimd_dot_f16_sve(simsimd_f16_t const *a_scalars, simsimd_
svbool_t pg_vec = svwhilelt_b16((unsigned int)idx_scalars, (unsigned int)count_scalars);
svfloat16_t a_vec = svld1_f16(pg_vec, (simsimd_f16_for_arm_simd_t const *)(a_scalars + idx_scalars));
svfloat16_t b_vec = svld1_f16(pg_vec, (simsimd_f16_for_arm_simd_t const *)(b_scalars + idx_scalars));
ab_vec = svmla_f16_x(pg_vec, ab_vec, a_vec, b_vec);
ab_vec = svmla_f16_m(pg_vec, ab_vec, a_vec, b_vec);
idx_scalars += svcnth();
} while (idx_scalars < count_scalars);
simsimd_f16_for_arm_simd_t ab = svaddv_f16(svptrue_b16(), ab_vec);
Expand All @@ -805,10 +805,10 @@ SIMSIMD_PUBLIC void simsimd_dot_f16c_sve(simsimd_f16c_t const *a_pairs, simsimd_
svfloat16_t a_imag_vec = svget2_f16(a_vec, 1);
svfloat16_t b_real_vec = svget2_f16(b_vec, 0);
svfloat16_t b_imag_vec = svget2_f16(b_vec, 1);
ab_real_vec = svmla_f16_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmls_f16_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f16_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmla_f16_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
ab_real_vec = svmla_f16_m(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmls_f16_m(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f16_m(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmla_f16_m(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcnth();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f16(svptrue_b16(), ab_real_vec);
Expand All @@ -829,10 +829,10 @@ SIMSIMD_PUBLIC void simsimd_vdot_f16c_sve(simsimd_f16c_t const *a_pairs, simsimd
svfloat16_t a_imag_vec = svget2_f16(a_vec, 1);
svfloat16_t b_real_vec = svget2_f16(b_vec, 0);
svfloat16_t b_imag_vec = svget2_f16(b_vec, 1);
ab_real_vec = svmla_f16_x(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmla_f16_x(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f16_x(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmls_f16_x(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
ab_real_vec = svmla_f16_m(pg_vec, ab_real_vec, a_real_vec, b_real_vec);
ab_real_vec = svmla_f16_m(pg_vec, ab_real_vec, a_imag_vec, b_imag_vec);
ab_imag_vec = svmla_f16_m(pg_vec, ab_imag_vec, a_real_vec, b_imag_vec);
ab_imag_vec = svmls_f16_m(pg_vec, ab_imag_vec, a_imag_vec, b_real_vec);
idx_pairs += svcnth();
} while (idx_pairs < count_pairs);
results[0] = svaddv_f16(svptrue_b16(), ab_real_vec);
Expand Down
28 changes: 14 additions & 14 deletions include/simsimd/spatial.h
Original file line number Diff line number Diff line change
Expand Up @@ -805,7 +805,7 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f32_sve(simsimd_f32_t const *a, simsimd_f32_t c
svfloat32_t a_vec = svld1_f32(pg_vec, a + i);
svfloat32_t b_vec = svld1_f32(pg_vec, b + i);
svfloat32_t a_minus_b_vec = svsub_f32_x(pg_vec, a_vec, b_vec);
d2_vec = svmla_f32_x(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
d2_vec = svmla_f32_m(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
i += svcntw();
} while (i < n);
simsimd_f32_t d2 = svaddv_f32(svptrue_b32(), d2_vec);
Expand All @@ -823,9 +823,9 @@ SIMSIMD_PUBLIC void simsimd_cos_f32_sve(simsimd_f32_t const *a, simsimd_f32_t co
svbool_t pg_vec = svwhilelt_b32((unsigned int)i, (unsigned int)n);
svfloat32_t a_vec = svld1_f32(pg_vec, a + i);
svfloat32_t b_vec = svld1_f32(pg_vec, b + i);
ab_vec = svmla_f32_x(pg_vec, ab_vec, a_vec, b_vec);
a2_vec = svmla_f32_x(pg_vec, a2_vec, a_vec, a_vec);
b2_vec = svmla_f32_x(pg_vec, b2_vec, b_vec, b_vec);
ab_vec = svmla_f32_m(pg_vec, ab_vec, a_vec, b_vec);
a2_vec = svmla_f32_m(pg_vec, a2_vec, a_vec, a_vec);
b2_vec = svmla_f32_m(pg_vec, b2_vec, b_vec, b_vec);
i += svcntw();
} while (i < n);

Expand All @@ -852,7 +852,7 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f64_sve(simsimd_f64_t const *a, simsimd_f64_t c
svfloat64_t a_vec = svld1_f64(pg_vec, a + i);
svfloat64_t b_vec = svld1_f64(pg_vec, b + i);
svfloat64_t a_minus_b_vec = svsub_f64_x(pg_vec, a_vec, b_vec);
d2_vec = svmla_f64_x(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
d2_vec = svmla_f64_m(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
i += svcntd();
} while (i < n);
simsimd_f64_t d2 = svaddv_f64(svptrue_b32(), d2_vec);
Expand All @@ -870,9 +870,9 @@ SIMSIMD_PUBLIC void simsimd_cos_f64_sve(simsimd_f64_t const *a, simsimd_f64_t co
svbool_t pg_vec = svwhilelt_b64((unsigned int)i, (unsigned int)n);
svfloat64_t a_vec = svld1_f64(pg_vec, a + i);
svfloat64_t b_vec = svld1_f64(pg_vec, b + i);
ab_vec = svmla_f64_x(pg_vec, ab_vec, a_vec, b_vec);
a2_vec = svmla_f64_x(pg_vec, a2_vec, a_vec, a_vec);
b2_vec = svmla_f64_x(pg_vec, b2_vec, b_vec, b_vec);
ab_vec = svmla_f64_m(pg_vec, ab_vec, a_vec, b_vec);
a2_vec = svmla_f64_m(pg_vec, a2_vec, a_vec, a_vec);
b2_vec = svmla_f64_m(pg_vec, b2_vec, b_vec, b_vec);
i += svcntd();
} while (i < n);

Expand Down Expand Up @@ -910,7 +910,7 @@ SIMSIMD_PUBLIC void simsimd_l2sq_f16_sve(simsimd_f16_t const *a_enum, simsimd_f1
svfloat16_t a_vec = svld1_f16(pg_vec, a + i);
svfloat16_t b_vec = svld1_f16(pg_vec, b + i);
svfloat16_t a_minus_b_vec = svsub_f16_x(pg_vec, a_vec, b_vec);
d2_vec = svmla_f16_x(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
d2_vec = svmla_f16_m(pg_vec, d2_vec, a_minus_b_vec, a_minus_b_vec);
i += svcnth();
} while (i < n);
simsimd_f16_for_arm_simd_t d2_f16 = svaddv_f16(svptrue_b16(), d2_vec);
Expand All @@ -930,9 +930,9 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_sve(simsimd_f16_t const *a_enum, simsimd_f16
svbool_t pg_vec = svwhilelt_b16((unsigned int)i, (unsigned int)n);
svfloat16_t a_vec = svld1_f16(pg_vec, a + i);
svfloat16_t b_vec = svld1_f16(pg_vec, b + i);
ab_vec = svmla_f16_x(pg_vec, ab_vec, a_vec, b_vec);
a2_vec = svmla_f16_x(pg_vec, a2_vec, a_vec, a_vec);
b2_vec = svmla_f16_x(pg_vec, b2_vec, b_vec, b_vec);
ab_vec = svmla_f16_m(pg_vec, ab_vec, a_vec, b_vec);
a2_vec = svmla_f16_m(pg_vec, a2_vec, a_vec, a_vec);
b2_vec = svmla_f16_m(pg_vec, b2_vec, b_vec, b_vec);
i += svcnth();
} while (i < n);

Expand Down Expand Up @@ -981,8 +981,8 @@ SIMSIMD_PUBLIC void simsimd_l2sq_bf16_sve(simsimd_bf16_t const *a_enum, simsimd_

svfloat32_t a_minus_b_low_vec = svsub_f32_x(pg_low_vec, a_low_vec, b_low_vec);
svfloat32_t a_minus_b_high_vec = svsub_f32_x(pg_high_vec, a_high_vec, b_high_vec);
d2_low_vec = svmla_f32_x(pg_low_vec, d2_low_vec, a_minus_b_low_vec, a_minus_b_low_vec);
d2_high_vec = svmla_f32_x(pg_high_vec, d2_high_vec, a_minus_b_high_vec, a_minus_b_high_vec);
d2_low_vec = svmla_f32_m(pg_low_vec, d2_low_vec, a_minus_b_low_vec, a_minus_b_low_vec);
d2_high_vec = svmla_f32_m(pg_high_vec, d2_high_vec, a_minus_b_high_vec, a_minus_b_high_vec);
i += svcnth();
} while (i < n);
simsimd_f32_t d2_low = svaddv_f32(svptrue_b32(), d2_low_vec);
Expand Down