Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix Rashba Hubbard segfault #327

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
19 changes: 0 additions & 19 deletions build-aux/frontier_rocm6_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,4 @@ cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \
-GNinja \
..

#cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON \
-DFFTW_ROOT=$FFTW_PATH \
-DDCA_FIX_BROKEN_MPICH=ON \
-DROCM_ROOT=${ROCM_PATH} \
-DMAGMA_ROOT=${MAGMA_ROOT} \
-DLAPACK_ROOT=${OPENBLAS_ROOT} \
-DBLAS_ROOT=${OPENBLAS_ROOT} \
-DDCA_WITH_TESTS_FAST=ON \
-DTEST_RUNNER="srun" \
-DGPU_TARGETS=gfx90a \
-DAMDGPU_TARGETS=gfx90a \
-DCMAKE_C_COMPILER=mpicc \
-DCMAKE_CXX_COMPILER=mpic++ \
-DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ \
-DCMAKE_INSTALL_PREFIX=$INST \
-DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
-GNinja \
..
# cmake -DDCA_WITH_CUDA=off -DDCA_WITH_HIP=ON -DFFTW_ROOT=$FFTW_PATH -DDCA_FIX_BROKEN_MPICH=ON -DROCM_ROOT=${ROCM_PATH} -DMAGMA_ROOT=${MAGMA_ROOT} -DLAPACK_ROOT=${OPENBLAS_ROOT} -DBLAS_ROOT=${OPENBLAS_ROOT} -DDCA_WITH_TESTS_FAST=ON -DTEST_RUNNER="srun" -DGPU_TARGETS=gfx90a -DAMDGPU_TARGETS=gfx90a -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DCMAKE_HIP_COMPILER=/opt/rocm-6.0.0/llvm/bin/clang++ -DCMAKE_INSTALL_PREFIX=$INST -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" -GNinja ..
..
14 changes: 8 additions & 6 deletions build-aux/frontier_rocm6_load_modules.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@

module reset
module load amd-mixed/6.0.0
spack load cmake%gcc@11.2.0
spack load ninja%gcc@11.2.0
spack load magma@master amdgpu_target=gfx90a
spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0
spack load fftw ~mpi %rocmcc@6.0.0
spack load openblas@0.3.25 %gcc@11.2.0
module load ninja
module load cmake
#spack load cmake%gcc@11.2.0
#spack load ninja%gcc@11.2.0
#spack load magma@master amdgpu_target=gfx90a
#spack load hdf5@1.12.1 +cxx ~mpi api=v112 %rocmcc@6.0.0
#spack load fftw ~mpi %rocmcc@6.0.0
#spack load openblas@0.3.25 %gcc@11.2.0

export CC=mpicc
export CXX=mpicxx
Expand Down
19 changes: 14 additions & 5 deletions cmake/dca_config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,10 @@ else()
endif()

# Lattice type
set(DCA_LATTICE "square" CACHE STRING "Lattice type, options are: bilayer | square | triangular |
Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn ")
set_property(CACHE DCA_LATTICE PROPERTY STRINGS bilayer square triangular Kagome hund twoband_Cu threeband
Rashba_Hubbard Moire_Hubbard FeAs material_NiO material_FeSn)
set(DCA_LATTICE "square" CACHE STRING "Lattice type, options are: bilayer | complex_square | square | triangular |
Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | RealRashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn ")
set_property(CACHE DCA_LATTICE PROPERTY STRINGS bilayer complex_square square triangular Kagome hund twoband_Cu threeband
Rashba_Hubbard RealRashba_Hubbard Moire_Hubbard FeAs material_NiO material_FeSn)

if (DCA_LATTICE STREQUAL "bilayer")
set(DCA_LATTICE_TYPE dca::phys::models::bilayer_lattice<PointGroup>)
Expand All @@ -135,6 +135,11 @@ elseif (DCA_LATTICE STREQUAL "square")
set(DCA_LATTICE_INCLUDE
"dca/phys/models/analytic_hamiltonians/square_lattice.hpp")

elseif (DCA_LATTICE STREQUAL "complex_square")
set(DCA_LATTICE_TYPE dca::phys::models::complex_square_lattice<PointGroup>)
set(DCA_LATTICE_INCLUDE
"dca/phys/models/analytic_hamiltonians/complex_square_lattice.hpp")

elseif (DCA_LATTICE STREQUAL "triangular")
set(DCA_LATTICE_TYPE dca::phys::models::triangular_lattice<PointGroup>)
set(DCA_LATTICE_INCLUDE
Expand All @@ -153,6 +158,10 @@ elseif (DCA_LATTICE STREQUAL "Rashba_Hubbard")
set(DCA_LATTICE_TYPE dca::phys::models::RashbaHubbard<PointGroup>)
set(DCA_LATTICE_INCLUDE
"dca/phys/models/analytic_hamiltonians/rashba_hubbard.hpp")
elseif (DCA_LATTICE STREQUAL "RealRashba_Hubbard")
set(DCA_LATTICE_TYPE dca::phys::models::RealRashbaHubbard<PointGroup>)
set(DCA_LATTICE_INCLUDE
"dca/phys/models/analytic_hamiltonians/real_rashba_hubbard.hpp")
elseif (DCA_LATTICE STREQUAL "Moire_Hubbard")
set(DCA_LATTICE_TYPE dca::phys::models::moire_hubbard<PointGroup>)
set(DCA_LATTICE_INCLUDE
Expand Down Expand Up @@ -180,7 +189,7 @@ elseif (DCA_LATTICE STREQUAL "material_FeSn")
"dca/phys/models/material_hamiltonians/material_lattice.hpp")
set(DCA_MODEL_IS_MATERIAL_LATTICE ON CACHE BOOL "is the model a material lattice")
else()
message(FATAL_ERROR "Please set DCA_LATTICE to a valid option: bilayer | square | triangular | Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn.")
message(FATAL_ERROR "Please set DCA_LATTICE to a valid option: bilayer | complex_square | square | triangular | Kagome | hund | twoband_Cu | threeband | Rashba_Hubbard | RealRashba_Hubbard | Moire_Hubbard | FeAs | material_NiO | material_FeSn.")
endif()

# Model type
Expand Down
4 changes: 4 additions & 0 deletions cmake/dca_hip.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ if (CMAKE_HIP_COMPILER)
set(DCA_HIP_PROPERTIES "CMAKE_HIP_ARCHITECTURES gfx908,gfx90a")
set(CMAKE_HIP_STANDARD 17)
list(APPEND HIP_HIPCC_FLAGS "-fPIC")
list(APPEND HIP_HIPCC_FLAGS "-mno-unsafe-fp-atomics")
list(APPEND HIP_HIPCC_FLAGS "-fgpu-default-stream=per-thread")
list(APPEND HIP_HIPCC_FLAGS_DEBUG "--save-temps -g")

# doesn't appear to work
set(CMAKE_HIP_SOURCE_FILE_EXTENSIONS cu)
message("Enabled HIP as a language")
Expand Down
49 changes: 48 additions & 1 deletion include/dca/linalg/util/atomic_add_cuda.cu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,52 @@ __device__ void inline atomicAdd(double* address, const double val) {
atomicAddImpl(address, val);
}

#elif defined(DCA_HAVE_HIP)
// Workaround: HIP's native floating-point atomics have shown problems under
// concurrent updates on some ROCm versions, so CAS-based fallbacks are used here.
// CAS-based double-precision atomic add (HIP workaround).
// Returns the value stored at `address` before this add was applied.
__device__ inline double atomicAddImpl(double* address, const double val) {
  unsigned long long int* raw = reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int observed = *raw;
  unsigned long long int expected;
  do {
    expected = observed;
    const double updated = val + __longlong_as_double(expected);
    observed = atomicCAS(raw, expected, __double_as_longlong(updated));
    // Compare the raw bits: a value comparison could loop forever on NaN (NaN != NaN).
  } while (expected != observed);
  return __longlong_as_double(observed);
}

// CAS-based single-precision atomic add (HIP workaround).
// Returns the value stored at `address` before this add was applied.
//
// Bug fix: the original used `unsigned long int` (64-bit on LP64 platforms) to
// alias a 4-byte float, so the CAS read and wrote 8 bytes of a 4-byte object
// (out-of-bounds access / corruption of the adjacent word), and it declared a
// `double` return type for a float operation. A float must be aliased as a
// 32-bit `unsigned int`, which is also the type the 32-bit atomicCAS overload
// expects.
__device__ float inline atomicAddImpl(float* address, const float val) {
  unsigned int* address_as_uint = (unsigned int*)address;
  unsigned int old = *address_as_uint, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_uint, assumed,
                    __float_as_uint(val + __uint_as_float(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
  return __uint_as_float(old);
}

// Single-precision overload dispatching to the CAS-based implementation above;
// the previous value returned by the implementation is intentionally discarded.
__device__ inline void atomicAdd(float* address, const float val) {
  static_cast<void>(atomicAddImpl(address, val));
}

// Double-precision overload dispatching to the CAS-based implementation above;
// the previous value returned by the implementation is intentionally discarded.
__device__ inline void atomicAdd(double* address, const double val) {
  static_cast<void>(atomicAddImpl(address, val));
}

// Double-complex atomic add: the real and imaginary parts are updated as two
// independent double atomics (the pair is not updated as a single transaction).
__device__ inline void atomicAdd(cuDoubleComplex* address, cuDoubleComplex val) {
  double* parts = reinterpret_cast<double*>(address);
  atomicAddImpl(parts, val.x);
  atomicAddImpl(parts + 1, val.y);
}

// Float-complex atomic add: the real and imaginary parts are updated as two
// independent float atomics (the pair is not updated as a single transaction).
//
// Bug fix: the components of a magmaFloatComplex are floats, but the original
// reinterpreted the address as double* and strided by sizeof(double), so it
// read/wrote 16 bytes of an 8-byte object — corrupting adjacent memory and
// producing garbage values. Alias as float* and use the float implementation.
__device__ void inline atomicAdd(magmaFloatComplex* const address, magmaFloatComplex val) {
  float* a_f = reinterpret_cast<float*>(address);
  atomicAddImpl(a_f, val.x);
  atomicAddImpl(a_f + 1, val.y);
}

#else
__device__ void inline atomicAdd(double* address, double val) {
::atomicAdd(address, val);
Expand All @@ -62,8 +108,9 @@ __device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val)
atomicAdd(a_d, val.x);
atomicAdd(a_d + 1, val.y);
}
#endif // __CUDA_ARCH__
#endif // atomic operation help


} // linalg
} // dca

Expand Down
1 change: 1 addition & 0 deletions include/dca/linalg/util/magma_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class MagmaQueue {
MagmaQueue& operator=(const MagmaQueue& rhs) = delete;

MagmaQueue(MagmaQueue&& rhs) noexcept : queue_(std::move(rhs.queue_)) {
std::swap(stream_, rhs.stream_);
std::swap(cublas_handle_, rhs.cublas_handle_);
std::swap(cusparse_handle_, rhs.cusparse_handle_);
std::swap(queue_, rhs.queue_);
Expand Down
2 changes: 2 additions & 0 deletions include/dca/math/nfft/dnfft_1d_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ void Dnfft1DGpu<Scalar, WDmn, RDmn, oversampling, CUBIC>::accumulate(
config_left_dev_.setAsync(config_left_, stream_);
times_dev_.setAsync(times_, stream_);

//hipStreamSynchronize(stream_.streamActually());

details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, Scalar, Real>(
M.ptr(), M.leadingDimension(), factor, accumulation_matrix_.ptr(),
accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -406,8 +406,8 @@ void CtauxAccumulator<device_t, Parameters, Data, DIST>::updateFrom(walker_type&
walker.get_error_distribution() = 0;
#endif // DCA_WITH_QMC_BIT

single_particle_accumulator_obj.syncStreams(*event);
two_particle_accumulator_.syncStreams(*event);
//single_particle_accumulator_obj.syncStreams(*event);
//two_particle_accumulator_.syncStreams(*event);
}

template <dca::linalg::DeviceType device_t, class Parameters, class Data, DistType DIST>
Expand Down
32 changes: 14 additions & 18 deletions include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -603,10 +603,6 @@ std::enable_if_t<dev_t == device_t && device_t != dca::linalg::CPU, void> CtauxW
read_Gamma_matrices(e_DN);

actually_download_from_device<device_t>();
// Gamma_up_CPU.setAsync(Gamma_up, thread_id, stream_id);
// Gamma_dn_CPU.setAsync(Gamma_dn, thread_id, stream_id);

// linalg::util::syncStream(thread_id, stream_id);
}

// In case Gamma_up and Gamma_down reside in the CPU memory, avoid the copies using swap.
Expand All @@ -625,9 +621,6 @@ std::enable_if_t<dev_t == device_t && device_t == dca::linalg::CPU, void> CtauxW
read_Gamma_matrices(e_DN);

actually_download_from_device<device_t>();

// Gamma_up_CPU.swap(Gamma_up);
// Gamma_dn_CPU.swap(Gamma_dn);
}

// In case Gamma_up and Gamma_down do not reside in the CPU memory, copy them.
Expand Down Expand Up @@ -1010,13 +1003,19 @@ void CtauxWalker<device_t, Parameters, Data>::read_Gamma_matrices(e_spin_states
// Profiler profiler(concurrency_, __FUNCTION__, "CT-AUX walker", __LINE__, thread_id);
switch (e_spin) {
case e_DN:
linalg::util::syncStream(thread_id, stream_id);
CT_AUX_WALKER_TOOLS<device_t, Scalar>::compute_Gamma(
Gamma_dn, N_dn, G_dn, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
// assume we've no guarantee this will be allowed to finish before the async copy starts
linalg::util::syncStream(thread_id, stream_id);
break;

case e_UP:
linalg::util::syncStream(thread_id, stream_id);
CT_AUX_WALKER_TOOLS<device_t, Scalar>::compute_Gamma(
Gamma_up, N_up, G_up, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
// assume we've no guarantee this will be allowed to finish before the async copy starts
linalg::util::syncStream(thread_id, stream_id);
break;

default:
Expand Down Expand Up @@ -1160,6 +1159,7 @@ void CtauxWalker<device_t, Parameters, Data>::remove_non_accepted_and_bennett_sp
}
}
}
linalg::util::syncStream(thread_id, stream_id);

assert(Gamma_up_size == Gamma_up_CPU.size().first and Gamma_up_size == Gamma_up_CPU.size().second);
assert(Gamma_dn_size == Gamma_dn_CPU.size().first and Gamma_dn_size == Gamma_dn_CPU.size().second);
Expand Down Expand Up @@ -1323,6 +1323,8 @@ void CtauxWalker<device_t, Parameters, Data>::add_delayed_spin(int& delayed_inde
}
*/

delayed_spins[delayed_index].is_accepted_move = false;

if (delayed_spins[delayed_index].e_spin_HS_field_DN == e_UP and
delayed_spins[delayed_index].e_spin_HS_field_UP == e_UP) {
Gamma_up_diag_max = tmp_up_diag_max < 1. ? 1. : tmp_up_diag_max;
Expand Down Expand Up @@ -1602,10 +1604,8 @@ void CtauxWalker<device_t, Parameters, Data>::updateShell(const int done, const
template <dca::linalg::DeviceType device_t, class Parameters, class Data>
template <typename AccumType>
const linalg::util::GpuEvent* CtauxWalker<device_t, Parameters, Data>::computeM(
std::array<linalg::Matrix<AccumType, device_t>, 2>& Ms) {
// Stream 1 waits on stream 0.
sync_streams_event_.record(linalg::util::getStream(thread_id, 0));
sync_streams_event_.block(linalg::util::getStream(thread_id, 1));
std::array<linalg::Matrix<AccumType, device_t>, 2>& Ms) {
linalg::util::syncStream(thread_id, stream_id);

for (int s = 0; s < 2; ++s) {
const auto& config = get_configuration().get(s == 0 ? e_DN : e_UP);
Expand All @@ -1620,18 +1620,14 @@ const linalg::util::GpuEvent* CtauxWalker<device_t, Parameters, Data>::computeM(
M.resizeNoCopy(N.size());

if (device_t == linalg::GPU) {
exp_v_minus_one_dev_[s].setAsync(exp_v_minus_one_[s], thread_id, s);
dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_dev_[s], N, M, thread_id, s);
exp_v_minus_one_dev_[s].setAsync(exp_v_minus_one_[s], thread_id, stream_id);
dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_dev_[s], N, M, thread_id, stream_id);
}
else {
dca::linalg::matrixop::multiplyDiagonalLeft(exp_v_minus_one_[s], N, M);
}
}

m_computed_events_[1].record(linalg::util::getStream(thread_id, 1));
m_computed_events_[1].block(linalg::util::getStream(thread_id, 0));

m_computed_events_[0].record(linalg::util::getStream(thread_id, 0));
m_computed_events_[0].record(linalg::util::getStream(thread_id, stream_id));
return &m_computed_events_[0];
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,12 @@ class CT_AUX_WALKER_TOOLS<dca::linalg::CPU, Scalar> {

private:
void solve_Gamma_slow(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);
void solve_Gamma_slow(int n, Scalar* Gamma_LU, int lda);
void solve_Gamma_fast(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);
void solve_Gamma_fast(int n, Scalar* A, int LD);
void solve_Gamma_BLAS(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);
void solve_Gamma_BLAS(int n, Scalar* Gamma_LU, int lda);

void solve_Gamma_fast(int n, Scalar* A, int LD);

void solve_Gamma_blocked(int n, dca::linalg::Matrix<Scalar, dca::linalg::CPU>& Gamma_LU);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters>::copy_rows(
assert(N_new_spins.nrCols() == N.nrCols());
assert(N_new_spins.nrRows() == permutation.size());
assert(permutation.size() <= identity.size());

dca::linalg::util::syncStream(thread_id, stream_id);
dca::linalg::matrixop::copyRows(N, permutation, N_new_spins, identity, thread_id, stream_id);
}

Expand All @@ -139,6 +139,7 @@ void N_MATRIX_TOOLS<dca::linalg::GPU, Parameters>::compute_G_cols(
dca::linalg::Matrix<Scalar, dca::linalg::GPU>& G,
dca::linalg::Matrix<Scalar, dca::linalg::GPU>& G_cols) {
exp_V.setAsync(exp_V_CPU, linalg::util::getStream(thread_id, stream_id));
dca::linalg::util::syncStream(thread_id, stream_id);

assert(N.nrRows() == G.nrRows());
assert(N.nrRows() == G_cols.nrRows());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class DMatrixBuilder<linalg::GPU, Scalar> final : public DMatrixBuilder<linalg::
// See DMatrixBuilder<linalg::CPU, Scalar>::computeG0.
// Out: G0. Device matrix
void computeG0(Matrix& G0, const details::DeviceConfiguration& configuration, int n_init,
bool right_section, GpuStream stream) const override;
bool right_section, const GpuStream& stream) const;

private:
const G0Interpolation<linalg::GPU, Scalar>& g0_ref_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,17 @@ namespace details {
template <typename Scalar, typename SignType>
void buildG0Matrix(linalg::MatrixView<Scalar, linalg::GPU> G0, const int n_init,
const bool right_section, DeviceConfiguration config,
DeviceInterpolationData<Scalar, SignType> g0_interp, dca::linalg::util::GpuStream stream);
DeviceInterpolationData<Scalar, SignType> g0_interp, const dca::linalg::util::GpuStream& stream);
extern template void buildG0Matrix(linalg::MatrixView<float, linalg::GPU>, const int, const bool,
DeviceConfiguration, DeviceInterpolationData<float, signed char>, const dca::linalg::util::GpuStream&);
extern template void buildG0Matrix(linalg::MatrixView<double, linalg::GPU>, const int, const bool,
DeviceConfiguration, DeviceInterpolationData<double, std::int8_t>, const dca::linalg::util::GpuStream&);
extern template void buildG0Matrix(linalg::MatrixView<std::complex<float>, linalg::GPU>, const int,
const bool, DeviceConfiguration,
DeviceInterpolationData<std::complex<float>, std::complex<float>>, const dca::linalg::util::GpuStream&);
extern template void buildG0Matrix(linalg::MatrixView<std::complex<double>, linalg::GPU>, const int,
const bool, DeviceConfiguration,
DeviceInterpolationData<std::complex<double>, std::complex<double>>, const dca::linalg::util::GpuStream&);

} // namespace details
} // namespace ctint
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ namespace details {
template <typename Scalar, typename Real>
void computeG0(linalg::MatrixView<Scalar, linalg::GPU>& g0_mat,
DeviceInterpolationData<Scalar, SignType<Scalar>> g0, const Real* t_l, const int* b_l,
const int* r_lf, const Real* t_r, const int* b_r, const int* r_r, const dca::linalg::util::GpuStream stream);
const int* r_lf, const Real* t_r, const int* b_r, const int* r_r, const dca::linalg::util::GpuStream& stream);



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class SpAccumulator<Parameters, linalg::GPU>
}

auto get_streams() {
return std::array<linalg::util::GpuStream*, 2>{&streams_[0], &streams_[1]};
return std::array<linalg::util::GpuStream*, 2>{&streams_[0], &streams_[0]};
}

// Returns the allocated device memory in bytes.
Expand All @@ -122,7 +122,7 @@ class SpAccumulator<Parameters, linalg::GPU>
*/
void finalizeFunction(std::array<NfftType, 2>& ft_objs, MFunction& function, bool m_sqr);

std::array<linalg::util::GpuStream, 2> streams_;
std::array<linalg::util::GpuStream, 1> streams_;
/** gpu M_r_t */
std::array<NfftType, 2> cached_nfft_obj_;
/** \todo Don't always pay the memory cost even when not collect single measurement G's */
Expand All @@ -137,9 +137,9 @@ SpAccumulator<Parameters, linalg::GPU>::SpAccumulator(const Parameters& paramete
: BaseClass(parameters_ref, accumulate_m_sqr),
streams_(),
cached_nfft_obj_{NfftType(parameters_.get_beta(), streams_[0], accumulate_m_sqr),
NfftType(parameters_.get_beta(), streams_[1], accumulate_m_sqr)},
NfftType(parameters_.get_beta(), streams_[0], accumulate_m_sqr)},
single_measurement_M_r_t_device_{NfftType(parameters_.get_beta(), streams_[0], false),
NfftType(parameters_.get_beta(), streams_[1], false)} {
NfftType(parameters_.get_beta(), streams_[0], false)} {
single_measurement_M_r_w_.reset(new MFunction("M_r_w"));
}

Expand Down Expand Up @@ -189,7 +189,7 @@ void SpAccumulator<Parameters, linalg::GPU>::accumulate(
const std::array<Configuration, 2>& configs, const Scalar factor) {
std::array<linalg::Matrix<Scalar, linalg::GPU>, 2> M_dev;
for (int s = 0; s < 2; ++s)
M_dev[s].setAsync(Ms[s], streams_[s]);
M_dev[s].setAsync(Ms[s], streams_[0]);

accumulate(M_dev, configs, factor);
}
Expand Down
Loading
Loading