Merge pull request #324 from antoniusf/fast-waves-fma-support

Improve CMake options relating to the PlaneWaveTurbulence optimization
CRPropa · Feb 23, 2021 · af45ff7 · af45ff7
2 parents 7a0a991 + 88b6960
commit af45ff7
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 30 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -36,18 +36,25 @@ if(CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE)
   message(STATUS "Use --as-needed linker flags!")
 endif(CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE)
 
-SET(USE_SIMD OFF CACHE BOOL "Enable SIMD flags (SSE 1 through 4 + AVX) in GCC. This is necessary for fast plane wave turbulence.")
-if(USE_SIMD)
-  SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -msse -msse2 -msse3 -msse4.1 -msse4.2 -mavx" ) # TODO: use something like -march=native to automatically enable fma if available?
-endif(USE_SIMD)
+SET(SIMD_EXTENSIONS "none" CACHE STRING "Choose which of the SIMD instruction set extensions your target CPU supports. Possible values are \"native\" (use everything that's supported by the CPU you're building on), \"none\", \"avx\", and \"avx+fma\".")
+
+if(SIMD_EXTENSIONS STREQUAL "avx")
+  SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -msse -msse2 -msse3 -msse4.1 -msse4.2 -mavx" )
+elseif(SIMD_EXTENSIONS STREQUAL "avx+fma")
+  SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -msse -msse2 -msse3 -msse4.1 -msse4.2 -mavx -mfma" )
+elseif(SIMD_EXTENSIONS STREQUAL "native")
+  SET( CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -march=native" )
+elseif(NOT SIMD_EXTENSIONS STREQUAL "none")
+  message(SEND_ERROR "SIMD_EXTENSIONS must have one of these values: \"native\", \"none\", \"avx\", or \"avx+fma\".")
+endif()
 
-SET(FAST_WAVES OFF CACHE BOOL "Enable SIMD optimizations for PlaneWaveTurbulence. Requires USE_SIMD to be set as well.")
+SET(FAST_WAVES OFF CACHE BOOL "Enable SIMD optimizations for PlaneWaveTurbulence. Requires that SIMD_EXTENSIONS is set to a compatible value (\"avx\", \"avx+fma\", or -- depending on the build CPU -- \"native\"). The build will fail if this option is set but SIMD_EXTENSIONS is not compatible, so you can adjust your settings.")
 if(FAST_WAVES)
-  if(USE_SIMD)
+  if(SIMD_EXTENSIONS STREQUAL "none")
+    message(SEND_ERROR "Enabling FAST_WAVES while SIMD_EXTENSIONS is set to \"none\" won't work, since SIMD is required for FAST_WAVES. Please try disabling FAST_WAVES, or try using another option for SIMD_EXTENSIONS.")
+  else()
     add_definitions(-DFAST_WAVES)
-  else(USE_SIMD)
-    message(SEND_ERROR "You've requested the FAST_WAVES implementation, but have not enabled USE_SIMD. PlaneWaveTurbulence will be compiled without the optimization.")
-  endif(USE_SIMD)
+  endif()
 endif(FAST_WAVES)
 
 # Add build type for profiling

diff --git a/include/crpropa/magneticField/turbulentField/PlaneWaveTurbulence.h b/include/crpropa/magneticField/turbulentField/PlaneWaveTurbulence.h
@@ -34,18 +34,56 @@ Furthermore, depending on the exact setting, the implementation is somewhat slow
 
  ## Using the SIMD optimization
  In order to mitigate some of the performance impact that is inherent in this
-method of field generation, an optimized version utilizing data-level
-parallelism through SIMD instructions is provided. More specifically, this
-implementation uses the AVX extension to the x86 instruction set. In order to
-use this optimized version, three conditions need to be met:
-
-1. The `USE_SIMD` option should be explicitly enabled in CMake. Currently,
-this sets GCC flags that tell the compiler to allow SIMD instructions.
-2. One also should enable the `FAST_WAVES` option to tell the compiler that you
-specifically want it to include the SIMD version of PlaneWaveTurbulence.
-3. Finally, the CPU that will actually run the code needs to support the
-required extensions: AVX and everything below. These extensions are relatively
-common, but there may still be processors in use that do not support them.
+method of field generation, an optimized version is provided. According to our
+tests (see the paper above), this version runs 20-30x faster than the baseline
+implementation and matches the speed of trilinear interpolation on a grid at
+a bit less than 100 wavemodes. To do this, it uses special CPU instructions
+called AVX, which are unfortunately not supported by every CPU. On Linux, you
+can check whether your CPU supports AVX by using the `lscpu` command, and
+searching the flags section for the string "avx". Alternatively, if you are
+building on the same CPU that CRPropa will run on, you can have the compiler
+detect this for you automatically (see below).
+
+ An additional speedup of about 1.33 can be achieved if the CPU supports the
+FMA extension in addition to AVX. Again, you can either check for this
+manually, or have the compiler figure it out for you.
+
+ **Note** that the optimized and non-optimized implementations to not return
+the exact same results. In fact, since the effective wave numbers used
+by the optimized implementation are very slightly different from those
+used by the non-optimized version (a difference smaller than the precision
+of a double, but nevertheless relevant at some point), the wavemodes go
+out of phase for large distances from the origin, and the fields are no longer
+comparable at all.
+
+ ### If you are building on the same machine that the code will run on:
+
+1. In cmake: enable the FAST_WAVES flag.
+2. Also in cmake: set SIMD_EXTENSIONS to "native". The compiler will automatically
+detect support for your CPU and run the build with the appropriate settings.
+3. Generate files and exit cmake, then build.
+3. If your CPU does not support the necessary extensions, the build will fail
+with an error telling you so. In this case, you won't be able to use the optimization;
+go back into cmake, disable FAST_WAVES, and build again.
+4. If the build runs through without errors, the code is built with the optimization.
+
+ ### If you are building on a different machine from the one where the code will run:
+
+1. Figure out which extensions your target machine supports, using `lscpu | grep avx`
+for AVX, and `lscpu | grep fma` for FMA. Run these commands on your target machine,
+not on your build machine. If your target machine does not run Linux, you may have to
+use different commands.
+2. If the CPU does not support AVX, you can stop here, and build normally. The CPU
+does not support the necessary extensions, and will not run code that is compiled
+using them.
+3. If the CPU does support AVX, continue. In cmake, enable the FAST_WAVES flag.
+4. Also in cmake, set SIMD_EXTENSIONS to the appropriate value. If your target CPU
+supports only AVX, but not FMA, set it to "avx". If it supports both, set it to
+"avx+fma".
+5. Generate and exit cmake, then build.
+6. If you made a mistake and used the wrong flags, you should see an illegal
+instruction error when trying to import CRPropa, or (at the latest) when trying
+to call `getField`.
 
 [GJ99]: https://doi.org/10.1086/307452
 [TD13]: https://doi.org/10.1063/1.4789861

diff --git a/src/magneticField/turbulentField/PlaneWaveTurbulence.cpp b/src/magneticField/turbulentField/PlaneWaveTurbulence.cpp
@@ -48,12 +48,21 @@
 
 #include <iostream>
 
-#ifdef FAST_WAVES
+#if defined(FAST_WAVES)
+#if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__) && defined(__SSE4_1__) && defined(__SSE4_2__) && defined(__AVX__)
+#define ENABLE_FAST_WAVES
+#else
+#error "FAST_WAVES is enabled, but it appears that not all required SIMD extensions are enabled in the compiler. Without these extensions, the FAST_WAVES optimization cannot be used. Please make sure that the SIMD_EXTENSIONS option in cmake matches the capabilities of your target CPU, or (if your target CPU does not support the required extensions), disable the FAST_WAVES flag in cmake."
+#endif
+#endif
+
+
+#ifdef ENABLE_FAST_WAVES
 #include <immintrin.h>
 #endif
 
 namespace crpropa {
-#ifdef FAST_WAVES
+#ifdef ENABLE_FAST_WAVES
 // see
 // https://stackoverflow.com/questions/49941645/get-sum-of-values-stored-in-m256d-with-sse-avx
 double hsum_double_avx(__m256d v) {
@@ -90,13 +99,13 @@ float hsum_float_avx(__m256 x) {
 	const __m128 sum = _mm_add_ss(lo, hi);
 	return _mm_cvtss_f32(sum);
 }
-#endif // defined(FAST_WAVES)
+#endif // defined(ENABLE_FAST_WAVES)
 
 PlaneWaveTurbulence::PlaneWaveTurbulence(const TurbulenceSpectrum &spectrum,
                                          int Nm, int seed)
     : TurbulentField(spectrum), Nm(Nm) {
 
-#ifdef FAST_WAVES
+#ifdef ENABLE_FAST_WAVES
 	KISS_LOG_INFO << "PlaneWaveTurbulence: Using SIMD TD13 implementation"
 	              << std::endl;
 
@@ -188,7 +197,7 @@ PlaneWaveTurbulence::PlaneWaveTurbulence(const TurbulenceSpectrum &spectrum,
 		Ak[i] = sqrt(2 * Ak[i] / Ak2_sum) * spectrum.getBrms();
 	}
 
-#ifdef FAST_WAVES
+#ifdef ENABLE_FAST_WAVES
 	// * copy data into AVX-compatible arrays *
 	// AVX requires all data to be aligned to 256 bit, or 32 bytes, which is the
 	// same as 4 double precision floating point numbers. Since support for
@@ -234,20 +243,20 @@ PlaneWaveTurbulence::PlaneWaveTurbulence(const TurbulenceSpectrum &spectrum,
 		// as well
 		avx_data[i + align_offset + avx_Nm * ibeta] = beta[i] / M_PI;
 	}
-#endif // FAST_WAVES
+#endif // ENABLE_FAST_WAVES
 }
 
 Vector3d PlaneWaveTurbulence::getField(const Vector3d &pos) const {
 
-#ifndef FAST_WAVES
+#ifndef ENABLE_FAST_WAVES
 	Vector3d B(0.);
 	for (int i = 0; i < Nm; i++) {
 		double z_ = pos.dot(kappa[i]);
 		B += xi[i] * Ak[i] * cos(k[i] * z_ + beta[i]);
 	}
 	return B;
 
-#else  // FAST_WAVES
+#else  // ENABLE_FAST_WAVES
 
 	// Initialize accumulators
 	//
@@ -392,7 +401,7 @@ Vector3d PlaneWaveTurbulence::getField(const Vector3d &pos) const {
 
 	return Vector3d(hsum_double_avx(acc0), hsum_double_avx(acc1),
 	                hsum_double_avx(acc2));
-#endif // FAST_WAVES
+#endif // ENABLE_FAST_WAVES
 }
 
 } // namespace crpropa