Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix Rashba Hubbard segfault #327

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
19 changes: 0 additions & 19 deletions build-aux/frontier_rocm6_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,4 @@ cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \
-GNinja \
..

#cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \
-DFFTW_ROOT=$FFTW_PATH \
-DDCA_FIX_BROKEN_MPICH=ON \
-DROCM_ROOT=${ROCM_PATH} \
-DMAGMA_ROOT=${MAGMA_ROOT} \
-DLAPACK_ROOT=${OPENBLAS_ROOT} \
-DBLAS_ROOT=${OPENBLAS_ROOT} \
-DDCA_WITH_TESTS_FAST=ON \
-DTEST_RUNNER="srun" \
-DGPU_TARGETS=gfx90a \
-DAMDGPU_TARGETS=gfx90a \
-DCMAKE_C_COMPILER=mpicc \
-DCMAKE_CXX_COMPILER=mpic++ \
-DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ \
-DCMAKE_INSTALL_PREFIX=$INST \
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
-GNinja \
..
# cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON -DFFTW_ROOT=$FFTW_PATH -DDCA_FIX_BROKEN_MPICH=ON -DROCM_ROOT=${ROCM_PATH} -DMAGMA_ROOT=${MAGMA_ROOT} -DLAPACK_ROOT=${OPENBLAS_ROOT} -DBLAS_ROOT=${OPENBLAS_ROOT} -DDCA_WITH_TESTS_FAST=ON -DTEST_RUNNER="srun" -DGPU_TARGETS=gfx90a -DAMDGPU_TARGETS=gfx90a -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ -DCMAKE_INSTALL_PREFIX=$INST -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" -GNinja ..
..
14 changes: 8 additions & 6 deletions build-aux/frontier_rocm6_load_modules.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@

module reset
module load amd-mixed/6.0.0
spack load cmake%gcc@11.2.0
spack load ninja%gcc@11.2.0
spack load magma@master amdgpu_target=gfx90a
spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0
spack load fftw ~mpi %rocmcc@6.0.0
spack load openblas@0.3.25 %gcc@11.2.0
module load ninja
module load cmake
#spack load cmake%gcc@11.2.0
#spack load ninja%gcc@11.2.0
#spack load magma@master amdgpu_target=gfx90a
#spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0
#spack load fftw ~mpi %rocmcc@6.0.0
#spack load openblas@0.3.25 %gcc@11.2.0

export CC=mpicc
export CXX=mpicxx
Expand Down
19 changes: 14 additions & 5 deletions cmake/dca_config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,10 @@ else()
endif()

# Lattice type
set(DCA_LATTICE "square" CACHE STRING "Lattice type, options are: bilayer | square | triangular |
Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn ")
set_property(CACHE DCA_LATTICE PROPERTY STRINGS bilayer square triangular Kagome hund twoband_Cu threeband
Rashba_Hubbard Moire_Hubbard FeAs material_NiO material_FeSn)
set(DCA_LATTICE "square" CACHE STRING "Lattice type, options are: bilayer | complex_square | square | triangular |
Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | RealRashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn ")
set_property(CACHE DCA_LATTICE PROPERTY STRINGS bilayer complex_square square triangular Kagome hund twoband_Cu threeband
Rashba_Hubbard RealRashba_Hubbard Moire_Hubbard FeAs material_NiO material_FeSn)

if (DCA_LATTICE STREQUAL "bilayer")
set(DCA_LATTICE_TYPE dca::phys::models::bilayer_lattice<PointGroup>)
Expand All @@ -135,6 +135,11 @@ elseif (DCA_LATTICE STREQUAL "square")
set(DCA_LATTICE_INCLUDE
"dca/phys/models/analytic_hamiltonians/square_lattice.hpp")

elseif (DCA_LATTICE STREQUAL "complex_square")
set(DCA_LATTICE_TYPE dca::phys::models::complex_square_lattice<PointGroup>)
set(DCA_LATTICE_INCLUDE
"dca/phys/models/analytic_hamiltonians/complex_square_lattice.hpp")

elseif (DCA_LATTICE STREQUAL "triangular")
set(DCA_LATTICE_TYPE dca::phys::models::triangular_lattice<PointGroup>)
set(DCA_LATTICE_INCLUDE
Expand All @@ -153,6 +158,10 @@ elseif (DCA_LATTICE STREQUAL "Rashba_Hubbard")
set(DCA_LATTICE_TYPE dca::phys::models::RashbaHubbard<PointGroup>)
set(DCA_LATTICE_INCLUDE
"dca/phys/models/analytic_hamiltonians/rashba_hubbard.hpp")
elseif (DCA_LATTICE STREQUAL "RealRashba_Hubbard")
set(DCA_LATTICE_TYPE dca::phys::models::RealRashbaHubbard<PointGroup>)
set(DCA_LATTICE_INCLUDE
"dca/phys/models/analytic_hamiltonians/real_rashba_hubbard.hpp")
elseif (DCA_LATTICE STREQUAL "Moire_Hubbard")
set(DCA_LATTICE_TYPE dca::phys::models::moire_hubbard<PointGroup>)
set(DCA_LATTICE_INCLUDE
Expand Down Expand Up @@ -180,7 +189,7 @@ elseif (DCA_LATTICE STREQUAL "material_FeSn")
"dca/phys/models/material_hamiltonians/material_lattice.hpp")
set(DCA_MODEL_IS_MATERIAL_LATTICE ON CACHE BOOL "is the model a material lattice")
else()
message(FATAL_ERROR "Please set DCA_LATTICE to a valid option: bilayer | square | triangular | Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn.")
message(FATAL_ERROR "Please set DCA_LATTICE to a valid option: bilayer | complex_square | square | triangular | Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | RealRashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn.")
endif()

# Model type
Expand Down
4 changes: 4 additions & 0 deletions cmake/dca_hip.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ if (CMAKE_HIP_COMPILER)
set(DCA_HIP_PROPERTIES "CMAKE_HIP_ARCHITECTURES gfx908,gfx90a")
set(CMAKE_HIP_STANDARD 17)
list(APPEND HIP_HIPCC_FLAGS "-fPIC")
list(APPEND HIP_HIPCC_FLAGS "-mno-unsafe-fp-atomics")
list(APPEND HIP_HIPCC_FLAGS "-fgpu-default-stream=per-thread")
list(APPEND HIP_HIPCC_FLAGS_DEBUG "--save-temps -g")

# doesn't appear to work
set(CMAKE_HIP_SOURCE_FILE_EXTENSIONS cu)
message("Enabled HIP as a language")
Expand Down
49 changes: 48 additions & 1 deletion include/dca/linalg/util/atomic_add_cuda.cu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,52 @@ __device__ void inline atomicAdd(double* address, const double val) {
atomicAddImpl(address, val);
}

#elif defined(DCA_HAVE_HIP)
// Workaround: HIP's native floating-point atomics have shown problems under
// concurrent updates on some ROCm versions, so CAS-based fallbacks are used here.
// CAS-based double-precision atomic add (HIP workaround).
// Returns the value stored at `address` before this add was applied.
__device__ inline double atomicAddImpl(double* address, const double val) {
  unsigned long long int* raw = reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int observed = *raw;
  unsigned long long int expected;
  do {
    expected = observed;
    const double updated = val + __longlong_as_double(expected);
    observed = atomicCAS(raw, expected, __double_as_longlong(updated));
    // Compare the raw bits: a value comparison could loop forever on NaN (NaN != NaN).
  } while (expected != observed);
  return __longlong_as_double(observed);
}

// CAS-based single-precision atomic add (HIP workaround).
// Returns the value stored at `address` before this add was applied.
//
// Bug fix: the original used `unsigned long int` (64-bit on LP64 platforms) to
// alias a 4-byte float, so the CAS read and wrote 8 bytes of a 4-byte object
// (out-of-bounds access / corruption of the adjacent word), and it declared a
// `double` return type for a float operation. A float must be aliased as a
// 32-bit `unsigned int`, which is also the type the 32-bit atomicCAS overload
// expects.
__device__ float inline atomicAddImpl(float* address, const float val) {
  unsigned int* address_as_uint = (unsigned int*)address;
  unsigned int old = *address_as_uint, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    __float_as_uint(val + __uint_as_float(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
  return __uint_as_float(old);
}

// Single-precision overload dispatching to the CAS-based implementation above;
// the previous value returned by the implementation is intentionally discarded.
__device__ inline void atomicAdd(float* address, const float val) {
  static_cast<void>(atomicAddImpl(address, val));
}

// Double-precision overload dispatching to the CAS-based implementation above;
// the previous value returned by the implementation is intentionally discarded.
__device__ inline void atomicAdd(double* address, const double val) {
  static_cast<void>(atomicAddImpl(address, val));
}

// Double-complex atomic add: the real and imaginary parts are updated as two
// independent double atomics (the pair is not updated as a single transaction).
__device__ inline void atomicAdd(cuDoubleComplex* address, cuDoubleComplex val) {
  double* parts = reinterpret_cast<double*>(address);
  atomicAddImpl(parts, val.x);
  atomicAddImpl(parts + 1, val.y);
}

// Float-complex atomic add: the real and imaginary parts are updated as two
// independent float atomics (the pair is not updated as a single transaction).
//
// Bug fix: the components of a magmaFloatComplex are floats, but the original
// reinterpreted the address as double* and strided by sizeof(double), so it
// read/wrote 16 bytes of an 8-byte object — corrupting adjacent memory and
// producing garbage values. Alias as float* and use the float implementation.
__device__ void inline atomicAdd(magmaFloatComplex* const address, magmaFloatComplex val) {
  float* a_f = reinterpret_cast<float*>(address);
  atomicAddImpl(a_f, val.x);
  atomicAddImpl(a_f + 1, val.y);
}

#else
__device__ void inline atomicAdd(double* address, double val) {
::atomicAdd(address, val);
Expand All @@ -62,8 +108,9 @@ __device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val)
atomicAdd(a_d, val.x);
atomicAdd(a_d + 1, val.y);
}
#endif // __CUDA_ARCH__
#endif // atomic operation help


} // linalg
} // dca

Expand Down
1 change: 1 addition & 0 deletions include/dca/linalg/util/magma_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class MagmaQueue {
MagmaQueue& operator=(const MagmaQueue& rhs) = delete;

MagmaQueue(MagmaQueue&& rhs) noexcept : queue_(std::move(rhs.queue_)) {
std::swap(stream_, rhs.stream_);
std::swap(cublas_handle_, rhs.cublas_handle_);
std::swap(cusparse_handle_, rhs.cusparse_handle_);
std::swap(queue_, rhs.queue_);
Expand Down
2 changes: 2 additions & 0 deletions include/dca/math/nfft/dnfft_1d_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ void Dnfft1DGpu<Scalar, WDmn, RDmn, oversampling, CUBIC>::accumulate(
config_left_dev_.setAsync(config_left_, stream_);
times_dev_.setAsync(times_, stream_);

//hipStreamSynchronize(stream_.streamActually());

details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, Scalar, Real>(
M.ptr(), M.leadingDimension(), factor, accumulation_matrix_.ptr(),
accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -406,8 +406,8 @@ void CtauxAccumulator<device_t, Parameters, Data, DIST>::updateFrom(walker_type&
walker.get_error_distribution() = 0;
#endif // DCA_WITH_QMC_BIT

single_particle_accumulator_obj.syncStreams(*event);
two_particle_accumulator_.syncStreams(*event);
//single_particle_accumulator_obj.syncStreams(*event);
//two_particle_accumulator_.syncStreams(*event);
}

template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST>
Expand Down
32 changes: 14 additions & 18 deletions include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -603,10 +603,6 @@ std::enable_if_t<dev_t == device_t && device_t != dca::linalg::CPU, void> CtauxW
read_Gamma_matrices(e_DN);

actually_download_from_device<device_t>();
// Gamma_up_CPU.setAsync(Gamma_up, thread_id, stream_id);
// Gamma_dn_CPU.setAsync(Gamma_dn, thread_id, stream_id);

// linalg::util::syncStream(thread_id, stream_id);
}

// In case Gamma_up and Gamma_down reside in the CPU memory, avoid the copies using swap.
Expand All @@ -625,9 +621,6 @@ std::enable_if_t<dev_t == device_t && device_t == dca::linalg::CPU, void> CtauxW
read_Gamma_matrices(e_DN);

actually_download_from_device<device_t>();

// Gamma_up_CPU.swap(Gamma_up);
// Gamma_dn_CPU.swap(Gamma_dn);
}

// In case Gamma_up and Gamma_down do not reside in the CPU memory, copy them.
Expand Down Expand Up @@ -1010,13 +1003,19 @@ void CtauxWalker<device_t, Parameters, Data>::read_Gamma_matrices(e_spin_states
// Profiler profiler(concurrency_, __FUNCTION__, "CT-AUX walker", __LINE__, thread_id);
switch (e_spin) {
case e_DN:
linalg::util::syncStream(thread_id, stream_id);
CT_AUX_WALKER_TOOLS<device_t, Scalar>::compute_Gamma(
Gamma_dn, N_dn, G_dn, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
// assume we've no guarantee this will be allowed to finish before the async copy starts
linalg::util::syncStream(thread_id, stream_id);
break;

case e_UP:
linalg::util::syncStream(thread_id, stream_id);
CT_AUX_WALKER_TOOLS<device_t, Scalar>::compute_Gamma(
Gamma_up, N_up, G_up, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
// assume we've no guarantee this will be allowed to finish before the async copy starts
linalg::util::syncStream(thread_id, stream_id);
break;

default:
Expand Down Expand Up @@ -1160,6 +1159,7 @@ void CtauxWalker<device_t, Parameters, Data>::remove_non_accepted_and_bennett_sp
}
}
}
linalg::util::syncStream(thread_id, stream_id);

assert(Gamma_up_size == Gamma_up_CPU.size().first and Gamma_up_size == Gamma_up_CPU.size().second);
assert(Gamma_dn_size == Gamma_dn_CPU.size().first and Gamma_dn_size == Gamma_dn_CPU.size().second);
Expand Down Expand Up @@ -1323,6 +1323,8 @@ void CtauxWalker<device_t, Parameters, Data>::add_delayed_spin(int& delayed_inde
}
*/

delayed_spins[delayed_index].is_accepted_move = false;

if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP and
delayed_spins[delayed_index].e_spin_HS_field_UP == e_UP) {
Gamma_up_diag_max = tmp_up_diag_max < 1. ? 1. : tmp_up_diag_max;
Expand Down Expand Up @@ -1602,10 +1604,8 @@ void CtauxWalker<device_t, Parameters, Data>::updateShell(const int done, const
template <dca::linalg::DeviceType device_t, class Parameters, class Data>
template <typename AccumType>
const linalg::util::GpuEvent* CtauxWalker<device_t, Parameters, Data>::computeM(
std::array<linalg::Matrix<AccumType, device_t>, 2>& Ms) {
// Stream 1 waits on stream 0.
sync_streams_event_.record(linalg::util::getStream(thread_id, 0));
sync_streams_event_.block(linalg::util::getStream(thread_id, 1));
std::array<linalg::Matrix<AccumType, device_t>, 2>& Ms) {
linalg::util::syncStream(thread_id, stream_id);

for (int s = 0; s < 2; ++s) {
const auto& config = get_configuration().get(s == 0 ? e_DN : e_UP);
Expand All @@ -1620,18 +1620,14 @@ const linalg::util::GpuEvent* CtauxWalker<device_t, Parameters, Data>::computeM(
M.resizeNoCopy(N.size());

if (device_t == linalg::GPU) {
exp_v_minus_one_dev_[s].setAsync(exp_v_minus_one_[s], thread_id, s);
dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_dev_[s], N, M, thread_id, s);
exp_v_minus_one_dev_[s].setAsync(exp_v_minus_one_[s], thread_id, stream_id);
dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_dev_[s], N, M, thread_id, stream_id);
}
else {
dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_[s], N, M);
}
}

m_computed_events_[1].record(linalg::util::getStream(thread_id, 1));
m_computed_events_[1].block(linalg::util::getStream(thread_id, 0));

m_computed_events_[0].record(linalg::util::getStream(thread_id, 0));
m_computed_events_[0].record(linalg::util::getStream(thread_id, stream_id));
return &m_computed_events_[0];
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,12 @@ class CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar> {

private:
void solve_Gamma_slow(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);
void solve_Gamma_slow(int n, Scalar* Gamma_LU, int lda);
void solve_Gamma_fast(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);
void solve_Gamma_fast(int n, Scalar* A, int LD);
void solve_Gamma_BLAS(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);
void solve_Gamma_BLAS(int n, Scalar* Gamma_LU, int lda);

void solve_Gamma_fast(int n, Scalar* A, int LD);

void solve_Gamma_blocked(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters>::copy_rows(
assert(N_new_spins.nrCols() == N.nrCols());
assert(N_new_spins.nrRows() == permutation.size());
assert(permutation.size() <= identity.size());

dca::linalg::util::syncStream(thread_id, stream_id);
dca::linalg::matrixop::copyRows(N, permutation, N_new_spins, identity, thread_id, stream_id);
}

Expand All @@ -139,6 +139,7 @@ void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters>::compute_G_cols(
dca::linalg::Matrix<Scalar, dca::linalg::GPU>& G,
dca::linalg::Matrix<Scalar, dca::linalg::GPU>& G_cols) {
exp_V.setAsync(exp_V_CPU, linalg::util::getStream(thread_id, stream_id));
dca::linalg::util::syncStream(thread_id, stream_id);

assert(N.nrRows() == G.nrRows());
assert(N.nrRows() == G_cols.nrRows());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class DMatrixBuilder<linalg::GPU, Scalar> final : public DMatrixBuilder<linalg::
// See DMatrixBuilder<linalg::CPU, Scalar>::computeG0.
// Out: G0. Device matrix
void computeG0(Matrix& G0, const details::DeviceConfiguration& configuration, int n_init,
bool right_section, GpuStream stream) const override;
bool right_section, const GpuStream& stream) const;

private:
const G0Interpolation<linalg::GPU, Scalar>& g0_ref_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,17 @@ namespace details {
template <typename Scalar, typename SignType>
void buildG0Matrix(linalg::MatrixView<Scalar, linalg::GPU> G0, const int n_init,
const bool right_section, DeviceConfiguration config,
DeviceInterpolationData<Scalar, SignType> g0_interp, dca::linalg::util::GpuStream stream);
DeviceInterpolationData<Scalar, SignType> g0_interp, const dca::linalg::util::GpuStream& stream);
extern template void buildG0Matrix(linalg::MatrixView<float, linalg::GPU>, const int, const bool,
DeviceConfiguration, DeviceInterpolationData<float, signed char>, const dca::linalg::util::GpuStream&);
extern template void buildG0Matrix(linalg::MatrixView<double, linalg::GPU>, const int, const bool,
DeviceConfiguration, DeviceInterpolationData<double, std::int8_t>, const dca::linalg::util::GpuStream&);
extern template void buildG0Matrix(linalg::MatrixView<std::complex<float>, linalg::GPU>, const int,
const bool, DeviceConfiguration,
DeviceInterpolationData<std::complex<float>, std::complex<float>>, const dca::linalg::util::GpuStream&);
extern template void buildG0Matrix(linalg::MatrixView<std::complex<double>, linalg::GPU>, const int,
const bool, DeviceConfiguration,
DeviceInterpolationData<std::complex<double>, std::complex<double>>, const dca::linalg::util::GpuStream&);

} // namespace details
} // namespace ctint
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ namespace details {
template <typename Scalar, typename Real>
void computeG0(linalg::MatrixView<Scalar, linalg::GPU>& g0_mat,
DeviceInterpolationData<Scalar, SignType<Scalar>> g0, const Real* t_l, const int* b_l,
const int* r_lf, const Real* t_r, const int* b_r, const int* r_r, const dca::linalg::util::GpuStream stream);
const int* r_lf, const Real* t_r, const int* b_r, const int* r_r, const dca::linalg::util::GpuStream& stream);



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class SpAccumulator<Parameters, linalg::GPU>
}

auto get_streams() {
return std::array<linalg::util::GpuStream*, 2>{&streams_[0], &streams_[1]};
return std::array<linalg::util::GpuStream*, 2>{&streams_[0], &streams_[0]};
}

// Returns the allocated device memory in bytes.
Expand All @@ -122,7 +122,7 @@ class SpAccumulator<Parameters, linalg::GPU>
*/
void finalizeFunction(std::array<NfftType, 2>& ft_objs, MFunction& function, bool m_sqr);

std::array<linalg::util::GpuStream, 2> streams_;
std::array<linalg::util::GpuStream, 1> streams_;
/** gpu M_r_t */
std::array<NfftType, 2> cached_nfft_obj_;
/** \todo Don't always pay the memory cost even when not collect single measurement G's */
Expand All @@ -137,9 +137,9 @@ SpAccumulator<Parameters, linalg::GPU>::SpAccumulator(const Parameters& paramete
: BaseClass(parameters_ref, accumulate_m_sqr),
streams_(),
cached_nfft_obj_{NfftType(parameters_.get_beta(), streams_[0], accumulate_m_sqr),
NfftType(parameters_.get_beta(), streams_[1], accumulate_m_sqr)},
NfftType(parameters_.get_beta(), streams_[0], accumulate_m_sqr)},
single_measurement_M_r_t_device_{NfftType(parameters_.get_beta(), streams_[0], false),
NfftType(parameters_.get_beta(), streams_[1], false)} {
NfftType(parameters_.get_beta(), streams_[0], false)} {
single_measurement_M_r_w_.reset(new MFunction("M_r_w"));
}

Expand Down Expand Up @@ -189,7 +189,7 @@ void SpAccumulator<Parameters, linalg::GPU>::accumulate(
const std::array<Configuration, 2>& configs, const Scalar factor) {
std::array<linalg::Matrix<Scalar, linalg::GPU>, 2> M_dev;
for (int s = 0; s < 2; ++s)
M_dev[s].setAsync(Ms[s], streams_[s]);
M_dev[s].setAsync(Ms[s], streams_[0]);

accumulate(M_dev, configs, factor);
}
Expand Down
Loading
Loading