Optimize calculation of dot product for double vectors with AVX
This improves the performance with the 'best' models and should also
make training faster.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
stweil committed Feb 20, 2019
1 parent b3bd23e commit ef4d5b2
Showing 1 changed file with 23 additions and 63 deletions.
86 changes: 23 additions & 63 deletions src/arch/dotproductavx.cpp
@@ -29,70 +29,30 @@ namespace tesseract {
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
 double DotProductAVX(const double* u, const double* v, int n) {
-  int max_offset = n - 4;
-  int offset = 0;
-  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
-  // v, and multiplying them together in parallel.
-  __m256d sum = _mm256_setzero_pd();
-  if (offset <= max_offset) {
-    offset = 4;
-    // Aligned load is reputedly faster but requires 32 byte aligned input.
-    if ((reinterpret_cast<uintptr_t>(u) & 31) == 0 &&
-        (reinterpret_cast<uintptr_t>(v) & 31) == 0) {
-      // Use aligned load.
-      __m256d floats1 = _mm256_load_pd(u);
-      __m256d floats2 = _mm256_load_pd(v);
-      // Multiply.
-      sum = _mm256_mul_pd(floats1, floats2);
-      while (offset <= max_offset) {
-        floats1 = _mm256_load_pd(u + offset);
-        floats2 = _mm256_load_pd(v + offset);
-        offset += 4;
-        __m256d product = _mm256_mul_pd(floats1, floats2);
-        sum = _mm256_add_pd(sum, product);
-      }
-    } else {
-      // Use unaligned load.
-      __m256d floats1 = _mm256_loadu_pd(u);
-      __m256d floats2 = _mm256_loadu_pd(v);
-      // Multiply.
-      sum = _mm256_mul_pd(floats1, floats2);
-      while (offset <= max_offset) {
-        floats1 = _mm256_loadu_pd(u + offset);
-        floats2 = _mm256_loadu_pd(v + offset);
-        offset += 4;
-        __m256d product = _mm256_mul_pd(floats1, floats2);
-        sum = _mm256_add_pd(sum, product);
-      }
-    }
+  const unsigned quot = n / 8;
+  const unsigned rem = n % 8;
+  __m256d t0 = _mm256_setzero_pd();
+  __m256d t1 = _mm256_setzero_pd();
+  for (unsigned k = 0; k < quot; k++) {
+    __m256d f0 = _mm256_loadu_pd(u);
+    __m256d f1 = _mm256_loadu_pd(v);
+    f0 = _mm256_mul_pd(f0, f1);
+    t0 = _mm256_add_pd(t0, f0);
+    u += 4;
+    v += 4;
+    __m256d f2 = _mm256_loadu_pd(u);
+    __m256d f3 = _mm256_loadu_pd(v);
+    f2 = _mm256_mul_pd(f2, f3);
+    t1 = _mm256_add_pd(t1, f2);
+    u += 4;
+    v += 4;
   }
-  // Add the 4 product sums together horizontally. Not so easy as with sse, as
-  // there is no add across the upper/lower 128 bit boundary, so permute to
-  // move the upper 128 bits to lower in another register.
-  __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
-  sum = _mm256_hadd_pd(sum, sum2);
-  sum = _mm256_hadd_pd(sum, sum);
-  double result;
-  // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse
-  // instruction, as that introduces a 70 cycle delay. All this casting is to
-  // fool the intrinsics into thinking we are extracting the bottom int64.
-  auto cast_sum = _mm256_castpd_si256(sum);
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-  *(reinterpret_cast<int64_t*>(&result)) =
-#if defined(_WIN32) || defined(__i386__)
-      // This is a very simple workaround that is activated
-      // for all platforms that do not have _mm256_extract_epi64.
-      // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y]
-      ((uint64_t*)&cast_sum)[0]
-#else
-      _mm256_extract_epi64(cast_sum, 0)
-#endif
-      ;
-#pragma GCC diagnostic pop
-  while (offset < n) {
-    result += u[offset] * v[offset];
-    ++offset;
+  t0 = _mm256_hadd_pd(t0, t1);
+  alignas(32) double tmp[4];
+  _mm256_store_pd(tmp, t0);
+  double result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
+  for (unsigned k = 0; k < rem; k++) {
+    result += *u++ * *v++;
   }
   return result;
 }
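The reduction at the end of the new loop relies on the lane-wise semantics of _mm256_hadd_pd: it adds adjacent pairs within each 128-bit lane, taking the even result slots from its first operand and the odd slots from its second, so the four doubles stored to tmp contain every element of both accumulators t0 and t1 exactly once. A minimal standalone sketch of that reduction (an illustration, not code from the commit; build with e.g. g++ -mavx):

// Illustrates the horizontal reduction used above, not code from the commit.
// _mm256_hadd_pd(a, b) yields, per 128-bit lane:
//   { a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3] }
#include <immintrin.h>
#include <cstdio>

int main() {
  __m256d t0 = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);  // elements 1, 2, 3, 4
  __m256d t1 = _mm256_set_pd(8.0, 7.0, 6.0, 5.0);  // elements 5, 6, 7, 8
  __m256d h = _mm256_hadd_pd(t0, t1);              // { 3, 11, 7, 15 }
  alignas(32) double tmp[4];
  _mm256_store_pd(tmp, h);
  double total = tmp[0] + tmp[1] + tmp[2] + tmp[3];
  printf("%g\n", total);  // prints 36, the sum of elements 1..8
  return 0;
}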
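Because the SIMD version reassociates the additions, its result can differ from a scalar loop in the last bits, so a rewrite like this is typically validated against a scalar reference with a small tolerance rather than bit-exactly. A sketch of such a harness, assuming it is compiled together with the file above; the test sizes and tolerance here are illustrative choices, not part of the commit:

// Hypothetical check of DotProductAVX against a naive scalar dot product,
// including sizes that exercise the rem = n % 8 tail loop.
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// Declaration matching src/arch/dotproductavx.cpp.
double DotProductAVX(const double* u, const double* v, int n);

static double DotProductScalar(const double* u, const double* v, int n) {
  double sum = 0.0;
  for (int i = 0; i < n; i++) sum += u[i] * v[i];
  return sum;
}

int main() {
  std::mt19937 rng(42);
  std::uniform_real_distribution<double> dist(-1.0, 1.0);
  // Sizes chosen to hit quot == 0, exact multiples of 8, and odd tails.
  const int sizes[] = {0, 1, 7, 8, 9, 16, 100, 1000};
  for (int n : sizes) {
    std::vector<double> u(n), v(n);
    for (int i = 0; i < n; i++) {
      u[i] = dist(rng);
      v[i] = dist(rng);
    }
    double avx = DotProductAVX(u.data(), v.data(), n);
    double ref = DotProductScalar(u.data(), v.data(), n);
    // Allow a small relative error for the reassociated additions.
    if (std::fabs(avx - ref) > 1e-9 * (std::fabs(ref) + 1.0)) {
      printf("n=%d: MISMATCH avx=%.17g ref=%.17g\n", n, avx, ref);
      return 1;
    }
  }
  printf("all sizes match\n");
  return 0;
}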
