diff --git a/Src/AmrCore/AMReX_TagBox.H b/Src/AmrCore/AMReX_TagBox.H index f6adb04577b..2053dd4998c 100644 --- a/Src/AmrCore/AMReX_TagBox.H +++ b/Src/AmrCore/AMReX_TagBox.H @@ -62,41 +62,19 @@ public: /** * \brief Mark neighbors of every tagged cell a distance nbuff away - * only search interior for initial tagged points where nwid - * is given as the width of the bndry region. * * \param nbuff - * \param nwid */ - void buffer (const IntVect& nbuf, const IntVect& nwid) noexcept; - - /** - * \brief Add location of every tagged cell to IntVect array, - * starting at given location. Returns the number of - * collated points. - * - * \param ar - * \param start - */ - Long collate (Vector& ar, int start) const noexcept; - - /** - * \brief Returns number of tagged cells in specified Box. - * - * \param bx - */ - Long numTags (const Box& bx) const noexcept; - - /** - * \brief Returns total number of tagged cells in the TagBox. - */ - Long numTags () const noexcept; + void buffer (const IntVect& nbuf) noexcept; /** * \brief Returns Vector\ of size domain.numPts() suitable for calling * Fortran, with positions set to same value as in the TagBox * dataPtr(). */ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif Vector tags () const noexcept; /** @@ -107,6 +85,9 @@ public: * \param ar * \param tilebx */ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif void get_itags(Vector& ar, const Box& tilebx) const noexcept; /** @@ -115,6 +96,9 @@ public: * * \param ar */ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif void tags (const Vector& ar) noexcept; /** @@ -123,6 +107,9 @@ public: * * \param ar */ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif void tags_and_untags (const Vector& ar) noexcept; /** @@ -132,6 +119,9 @@ public: * \param ar * \param tilebx */ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif void tags (const Vector& ar, const Box& tilebx) noexcept; /** @@ -141,6 +131,9 @@ public: * \param ar * \param tilebx */ +//#if (__cplusplus >= 201402L) +// [[deprecated("No need to use this unless calling Fortran < 2003")]] +//#endif void tags_and_untags (const Vector& ar, const Box& tilebx) noexcept; }; @@ -225,17 +218,17 @@ public: */ void coarsen (const IntVect& ratio); - /** - * \brief The total number of Tags in all the contained TagBoxes. - */ - Long numTags () const; - /** * \brief Calls collate() on all contained TagBoxes. * * \param TheGlobalCollateSpace */ void collate (Vector& TheGlobalCollateSpace) const; + + void local_collate_cpu (Vector& v) const; +#ifdef AMREX_USE_GPU + void local_collate_gpu (Vector& v) const; +#endif }; } diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index 550b47e66a6..2ac3163e0cc 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace amrex { @@ -20,15 +21,11 @@ TagBox::TagBox (Arena* ar) noexcept TagBox::TagBox (const Box& bx, int n, Arena* ar) : BaseFab(bx,n,ar) -{ - setVal(TagBox::CLEAR); -} +{} TagBox::TagBox (const Box& bx, int n, bool alloc, bool shared, Arena* ar) : BaseFab(bx,n,alloc,shared,ar) -{ - if (alloc) setVal(TagBox::CLEAR); -} +{} TagBox::TagBox (const TagBox& rhs, MakeType make_type, int scomp, int ncomp) : BaseFab(rhs,make_type,scomp,ncomp) @@ -37,146 +34,74 @@ TagBox::TagBox (const TagBox& rhs, MakeType make_type, int scomp, int ncomp) void TagBox::coarsen (const IntVect& ratio, const Box& cbox) noexcept { - // xxxxx TODO: gpu - BL_ASSERT(nComp() == 1); Array4 const& farr = this->const_array(); - TagBox cfab(cbox, 1, The_Cpu_Arena()); + TagBox cfab(cbox, 1, The_Arena()); + Elixir eli = cfab.elixir(); Array4 const& carr = cfab.array(); - const auto flo = amrex::lbound(domain); - const auto fhi = amrex::ubound(domain); + Box fdomain = domain; Dim3 r{1,1,1}; AMREX_D_TERM(r.x = ratio[0];, r.y = ratio[1];, r.z = ratio[2]); - for (int k = flo.z; k <= fhi.z; ++k) { - int kc = amrex::coarsen(k,r.z); - for (int j = flo.y; j <= fhi.y; ++j) { - int jc = amrex::coarsen(j,r.y); - for (int i = flo.x; i <= fhi.x; ++i) { - int ic = amrex::coarsen(i,r.x); - carr(ic,jc,kc) = carr(ic,jc,kc) || farr(i,j,k); - } - } - } - - std::memcpy(this->dataPtr(), cfab.dataPtr(), sizeof(TagType)*cbox.numPts()); - this->domain = cbox; -} - -void -TagBox::buffer (const IntVect& nbuff, const IntVect& nwid) noexcept -{ - // - // Note: this routine assumes cell with TagBox::SET tag are in - // interior of tagbox (region = grow(domain,-nwid)). - // - Box inside(domain); - inside.grow(-nwid); - const int* inlo = inside.loVect(); - const int* inhi = inside.hiVect(); - - int klo = 0, khi = 0, jlo = 0, jhi = 0, ilo, ihi; - AMREX_D_TERM(ilo=inlo[0]; ihi=inhi[0]; , - jlo=inlo[1]; jhi=inhi[1]; , - klo=inlo[2]; khi=inhi[2];) - - int ni = 0, nj = 0, nk = 0; - AMREX_D_TERM(ni=nbuff[0];, nj=nbuff[1];, nk=nbuff[2];) - - IntVect d_length = domain.size(); - const int* len = d_length.getVect(); - const int* lo = domain.loVect(); - TagType* d = dataPtr(); - - amrex::ignore_unused(len); -#define OFF(i,j,k,lo,len) AMREX_D_TERM(i-lo[0], +(j-lo[1])*len[0] , +(k-lo[2])*len[0]*len[1]) - - for (int k = klo; k <= khi; k++) + AMREX_HOST_DEVICE_FOR_3D(cbox, i, j, k, { - for (int j = jlo; j <= jhi; j++) - { - for (int i = ilo; i <= ihi; i++) - { - TagType* d_check = d + OFF(i,j,k,lo,len); - if (*d_check == TagBox::SET) - { - for (int kk = -nk; kk <= nk; kk++) - { - for (int jj = -nj; jj <= nj; jj++) - { - for (int ii = -ni; ii <= ni; ii++) - { - TagType* dn = d_check+ AMREX_D_TERM(ii, +jj*len[0], +kk*len[0]*len[1]); - if (*dn !=TagBox::SET) - *dn = TagBox::BUF; - } - } + TagType t = TagBox::CLEAR; + for (int koff = 0; koff < r.z; ++koff) { + int kk = k*r.z + koff; + for (int joff = 0; joff < r.y; ++joff) { + int jj = j*r.y + joff; + for (int ioff = 0; ioff < r.x; ++ioff) { + int ii = i*r.x + ioff; + if (fdomain.contains(IntVect(AMREX_D_DECL(ii,jj,kk)))) { + t = t || farr(ii,jj,kk); } } } } - } -#undef OFF -} + carr(i,j,k) = t; + }); -Long -TagBox::numTags () const noexcept -{ - Long nt = 0L; - Long len = domain.numPts(); - const TagType* d = dataPtr(); - for (Long n = 0; n < len; ++n) +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + Gpu::dtod_memcpy_async(this->dataPtr(), cfab.dataPtr(), sizeof(TagType)*cbox.numPts()); + } else +#endif { - if (d[n] != TagBox::CLEAR) - ++nt; + std::memcpy(this->dataPtr(), cfab.dataPtr(), sizeof(TagType)*cbox.numPts()); } - return nt; -} - -Long -TagBox::numTags (const Box& b) const noexcept -{ - TagBox tempTagBox(b,1); - tempTagBox.copy(*this); - return tempTagBox.numTags(); + this->domain = cbox; } -Long -TagBox::collate (Vector& ar, int start) const noexcept +void +TagBox::buffer (const IntVect& a_nbuff) noexcept { - BL_ASSERT(start >= 0); - // - // Starting at given offset of array ar, enter location (IntVect) of - // each tagged cell in tagbox. - // - Long count = 0; - IntVect d_length = domain.size(); - const int* len = d_length.getVect(); - const int* lo = domain.loVect(); - const TagType* d = dataPtr(); - int ni = 1, nj = 1, nk = 1; - AMREX_D_TERM(ni = len[0]; , nj = len[1]; , nk = len[2];) - - for (int k = 0; k < nk; k++) + Array4 const& a = this->array(); + Dim3 nbuf = a_nbuff.dim3(); + const auto lo = amrex::lbound(domain); + const auto hi = amrex::ubound(domain); + AMREX_HOST_DEVICE_FOR_3D(domain, i, j, k, { - for (int j = 0; j < nj; j++) - { - for (int i = 0; i < ni; i++) - { - const TagType* dn = d + AMREX_D_TERM(i, +j*len[0], +k*len[0]*len[1]); - if (*dn != TagBox::CLEAR) - { - ar[start++] = IntVect(AMREX_D_DECL(lo[0]+i,lo[1]+j,lo[2]+k)); - count++; - } - } + if (a(i,j,k) == TagBox::CLEAR) { + bool to_buf = false; + int imin = amrex::max(i-nbuf.x, lo.x); + int jmin = amrex::max(j-nbuf.y, lo.y); + int kmin = amrex::max(k-nbuf.z, lo.z); + int imax = amrex::min(i+nbuf.x, hi.x); + int jmax = amrex::min(j+nbuf.y, hi.y); + int kmax = amrex::min(k+nbuf.z, hi.z); + for (int kk = kmin; kk <= kmax && !to_buf; ++kk) { + for (int jj = jmin; jj <= jmax && !to_buf; ++jj) { + for (int ii = imin; ii <= imax && !to_buf; ++ii) { + if (a(ii,jj,kk) == TagBox::SET) to_buf = true; + }}} + if (to_buf) a(i,j,k) = TagBox::BUF; } - } - return count; + }); } +// DEPRECATED Vector TagBox::tags () const noexcept { @@ -194,7 +119,7 @@ TagBox::tags () const noexcept return ar; } - +// DEPRECATED // Set values as specified by the array -- this only tags. // It's an error if ar.length() != domain.numPts(). void @@ -212,6 +137,7 @@ TagBox::tags (const Vector& ar) noexcept } } +// DEPRECATED // Set values as specified by the array -- this tags and untags. // It's an error if ar.length() != domain.numPts(). void @@ -229,6 +155,7 @@ TagBox::tags_and_untags (const Vector& ar) noexcept } } +// DEPRECATED // Since a TagBox is a BaseFab, we can use this utility // function to allocate an integer array to have the same number // of elements as cells in tilebx @@ -271,6 +198,7 @@ TagBox::get_itags(Vector& ar, const Box& tilebx) const noexcept } } +// DEPRECATED // Set values as specified by the array -- this only tags. // only changes values in the tilebx region void @@ -304,6 +232,7 @@ TagBox::tags (const Vector& ar, const Box& tilebx) noexcept } } +// DEPRECATED // Set values as specified by the array -- this tags and untags. // only changes values in the tilebx region void @@ -343,7 +272,7 @@ TagBoxArray::TagBoxArray (const BoxArray& ba, : FabArray(ba,dm,1,_ngrow,MFInfo(),DefaultFabFactory()) { - if (SharedMemory()) setVal(TagBox::CLEAR); + setVal(TagBox::CLEAR); } TagBoxArray::TagBoxArray (const BoxArray& ba, @@ -352,23 +281,22 @@ TagBoxArray::TagBoxArray (const BoxArray& ba, : FabArray(ba,dm,1,_ngrow,MFInfo(),DefaultFabFactory()) { - if (SharedMemory()) setVal(TagBox::CLEAR); + setVal(TagBox::CLEAR); } void TagBoxArray::buffer (const IntVect& nbuf) { - Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu - AMREX_ASSERT(nbuf.allLE(n_grow)); if (nbuf.max() > 0) { #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel if (Gpu::notInLaunchRegion()) #endif - for (MFIter mfi(*this); mfi.isValid(); ++mfi) - get(mfi).buffer(nbuf, n_grow); + for (MFIter mfi(*this); mfi.isValid(); ++mfi) { + get(mfi).buffer(nbuf); + } } } @@ -377,80 +305,294 @@ TagBoxArray::mapPeriodicRemoveDuplicates (const Geometry& geom) { BL_PROFILE("TagBoxArray::mapPRD"); - Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu + if (Gpu::inLaunchRegion()) + { + // There is not atomicAdd for char. So we have to use int. + iMultiFab itag = amrex::cast(*this); + iMultiFab tmp(boxArray(),DistributionMap(),1,nGrowVect()); + tmp.setVal(0); + tmp.ParallelAdd(itag, 0, 0, 1, nGrowVect(), nGrowVect(), geom.periodicity()); + + // We need to keep tags in periodic boundary + const auto owner_mask = amrex::OwnerMask(tmp, Periodicity::NonPeriodic(), nGrowVect()); +#ifdef _OPENMP +#pragma omp parallel +#endif + for (MFIter mfi(tmp); mfi.isValid(); ++mfi) { + Box const& box = mfi.fabbox(); + Array4 const& tag =this->array(mfi); + Array4 const& tmptag = tmp.const_array(mfi); + Array4 const& msk = owner_mask->const_array(mfi); + amrex::ParallelFor(box, + [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + if (msk(i,j,k)) { + tag(i,j,k) = static_cast(tmptag(i,j,k)); + } else { + tag(i,j,k) = TagBox::CLEAR; + } + }); + } + } + else + { + TagBoxArray tmp(boxArray(),DistributionMap(),nGrowVect()); // note that tmp is filled w/ CLEAR. + tmp.ParallelAdd(*this, 0, 0, 1, nGrowVect(), nGrowVect(), geom.periodicity()); - TagBoxArray tmp(boxArray(),DistributionMap(),nGrowVect()); // note that tmp is filled w/ CLEAR. + // We need to keep tags in periodic boundary + const auto owner_mask = amrex::OwnerMask(tmp, Periodicity::NonPeriodic(), nGrowVect()); +#ifdef _OPENMP +#pragma omp parallel +#endif + for (MFIter mfi(tmp); mfi.isValid(); ++mfi) { + Box const& box = mfi.fabbox(); + Array4 const& tag = tmp.array(mfi); + Array4 const& msk = owner_mask->const_array(mfi); + AMREX_LOOP_3D(box, i, j, k, + { + if (!msk(i,j,k)) tag(i,j,k) = TagBox::CLEAR; + }); + } + + std::swap(*this, tmp); + } +} - tmp.ParallelAdd(*this, 0, 0, 1, nGrowVect(), nGrowVect(), geom.periodicity()); +void +TagBoxArray::local_collate_cpu (Vector& v) const +{ + if (this->local_size() == 0) return; - // We need to keep tags in periodic boundary - const auto owner_mask = amrex::OwnerMask(tmp, Periodicity::NonPeriodic(), nGrowVect()); + Vector count(this->local_size()); #ifdef _OPENMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) +#pragma omp parallel #endif - for (MFIter mfi(tmp); mfi.isValid(); ++mfi) { - Box const& box = mfi.fabbox(); - Array4 const& tag = tmp.array(mfi); - Array4 const& msk = owner_mask->const_array(mfi); - AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, + for (MFIter fai(*this); fai.isValid(); ++fai) + { + Array4 const& arr = this->const_array(fai); + Box const& bx = fai.fabbox(); + int c = 0; + AMREX_LOOP_3D(bx,i,j,k, { - if (!msk(i,j,k)) tag(i,j,k) = TagBox::CLEAR; + if (arr(i,j,k) != TagBox::CLEAR) ++c; }); + count[fai.LocalIndex()] = c; } - std::swap(*this, tmp); -} + Vector offset(count.size()+1); + offset[0] = 0; + std::partial_sum(count.begin(), count.end(), offset.begin()+1); -Long -TagBoxArray::numTags () const -{ - Long ntag = 0; + v.resize(offset.back()); - Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu + if (v.empty()) return; #ifdef _OPENMP -#pragma omp parallel reduction(+:ntag) +#pragma omp parallel #endif - for (MFIter mfi(*this); mfi.isValid(); ++mfi) + for (MFIter fai(*this); fai.isValid(); ++fai) { - ntag += get(mfi).numTags(); + int li = fai.LocalIndex(); + if (count[li] > 0) { + IntVect* p = v.data() + offset[li]; + Array4 const& arr = this->const_array(fai); + Box const& bx = fai.fabbox(); + AMREX_LOOP_3D(bx,i,j,k, + { + if (arr(i,j,k) != TagBox::CLEAR) { + *p++ = IntVect(AMREX_D_DECL(i,j,k)); + } + }); + } } - - ParallelDescriptor::ReduceLongSum(ntag); - - return ntag; } +#ifdef AMREX_USE_GPU void -TagBoxArray::collate (Vector& TheGlobalCollateSpace) const +TagBoxArray::local_collate_gpu (Vector& v) const { - BL_PROFILE("TagBoxArray::collate()"); + const int nfabs = this->local_size(); + if (nfabs == 0) return; - Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu + constexpr int block_size = 128; + Vector nblocks(nfabs); + for (MFIter fai(*this); fai.isValid(); ++fai) + { + Box const& bx = fai.fabbox(); + nblocks[fai.LocalIndex()] = (bx.numPts() + block_size-1) / block_size; + } + Vector blockoffset(nblocks.size()+1); + blockoffset[0] = 0; + std::partial_sum(nblocks.begin(), nblocks.end(), blockoffset.begin()+1); + int ntotblocks = blockoffset.back(); - Long count = 0; + PODVector > dv_ntags(ntotblocks); -#ifdef _OPENMP -#pragma omp parallel reduction(+:count) -#endif for (MFIter fai(*this); fai.isValid(); ++fai) { - count += get(fai).numTags(); + const int li = fai.LocalIndex(); + int* ntags = dv_ntags.data() + blockoffset[li]; + const int ncells = fai.fabbox().numPts(); + const char* tags = (*this)[fai].dataPtr(); +#ifdef AMREX_USE_DPCPP + amrex::launch(nblocks[li], block_size, sizeof(int)*Gpu::Device::warp_size, + Gpu::Device::gpuStream(), + [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept + { + int bid = h.item.get_group_linear_id(); + int tid = h.item.get_local_id(0); + int icell = h.item.get_global_id(0); + + int t = 0; + if (icell < ncells && tags[icell] != TagBox::CLEAR) { + t = 1; + } + + t = Gpu::blockReduce + (t, Gpu::warpReduce >(), 0, h); + if (tid == 0) { + ntags[bid] = t; + } + }); +#else + amrex::launch(nblocks[li], block_size, Gpu::Device::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept + { + int bid = blockIdx.x; + int tid = threadIdx.x; + int icell = blockDim.x*blockIdx.x+threadIdx.x; + + int t = 0; + if (icell < ncells && tags[icell] != TagBox::CLEAR) { + t = 1; + } + + t = Gpu::blockReduce + (t, Gpu::warpReduce >(), 0); + if (tid == 0) { + ntags[bid] = t; + } + }); +#endif } - // - // Local space for holding just those tags we want to gather to the root cpu. - // - Vector TheLocalCollateSpace(count); + PODVector > hv_ntags(ntotblocks); + Gpu::dtoh_memcpy(hv_ntags.data(), dv_ntags.data(), ntotblocks*sizeof(int)); + + PODVector > hv_tags_offset(ntotblocks+1); + hv_tags_offset[0] = 0; + std::partial_sum(hv_ntags.begin(), hv_ntags.end(), hv_tags_offset.begin()+1); + int ntotaltags = hv_tags_offset.back(); + + if (ntotaltags == 0) return; - count = 0; + PODVector > dv_tags_offset(ntotblocks); + int* dp_tags_offset = dv_tags_offset.data(); + Gpu::htod_memcpy(dp_tags_offset, hv_tags_offset.data(), ntotblocks*sizeof(int)); +#ifdef AMREX_USE_DPCPP + Gpu::synchronize(); +#endif + + PODVector > dv_tags(ntotaltags); + IntVect* dp_tags = dv_tags.data(); - // unsafe to do OMP + int iblock = 0; for (MFIter fai(*this); fai.isValid(); ++fai) { - count += get(fai).collate(TheLocalCollateSpace,count); + const int li = fai.LocalIndex(); + int iblock_begin = iblock; + int iblock_end = iblock + nblocks[li]; + iblock = iblock_end; + int count = 0; + for (int ib = iblock_begin; ib < iblock_end; ++ib) { + count += hv_ntags[ib]; + } + if (count > 0) { + Box const& bx = fai.fabbox(); + const auto lo = amrex::lbound(bx); + const auto len = amrex::length(bx); + const int ncells = bx.numPts(); + const char* tags = (*this)[fai].dataPtr(); +#ifdef AMREX_USE_DPCPP + amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(), + [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept + { + int bid = h.item.get_group(0); + int tid = h.item.get_local_id(0); + int icell = h.item.get_global_id(0); + + unsigned int* shared_counter = (unsigned int*)h.local; + if (tid == 0) { + *shared_counter = 0; + } + h.item.barrier(sycl::access::fence_space::local_space); + + if (icell < ncells && tags[icell] != TagBox::CLEAR) { + unsigned int itag = Gpu::Atomic::Inc + (shared_counter, 20480u); + IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid]; + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + p[itag] = IntVect(AMREX_D_DECL(i,j,k)); + } + }); +#else + amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept + { + int bid = blockIdx.x; + int tid = threadIdx.x; + int icell = blockDim.x*blockIdx.x+threadIdx.x; + + Gpu::SharedMemory gsm; + unsigned int * shared_counter = gsm.dataPtr(); + if (tid == 0) { + *shared_counter = 0; + } + __syncthreads(); + + if (icell < ncells && tags[icell] != TagBox::CLEAR) { + unsigned int itag = Gpu::Atomic::Inc(shared_counter, blockDim.x); + IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid]; + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + p[itag] = IntVect(AMREX_D_DECL(i,j,k)); + } + }); +#endif + } } + v.resize(ntotaltags); + Gpu::dtoh_memcpy(v.data(), dp_tags, ntotaltags*sizeof(IntVect)); +} +#endif + +void +TagBoxArray::collate (Vector& TheGlobalCollateSpace) const +{ + BL_PROFILE("TagBoxArray::collate()"); + + Vector TheLocalCollateSpace; +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + local_collate_gpu(TheLocalCollateSpace); + } else +#endif + { + local_collate_cpu(TheLocalCollateSpace); + } + + Long count = TheLocalCollateSpace.size(); + // // The total number of tags system wide that must be collated. // @@ -501,60 +643,68 @@ TagBoxArray::collate (Vector& TheGlobalCollateSpace) const } void -TagBoxArray::setVal (const BoxList& bl, - TagBox::TagVal val) +TagBoxArray::setVal (const BoxList& bl, TagBox::TagVal val) { BoxArray ba(bl); setVal(ba,val); } void -TagBoxArray::setVal (const BoxDomain& bd, - TagBox::TagVal val) +TagBoxArray::setVal (const BoxDomain& bd, TagBox::TagVal val) { setVal(bd.boxList(),val); } void -TagBoxArray::setVal (const BoxArray& ba, - TagBox::TagVal val) +TagBoxArray::setVal (const BoxArray& ba, TagBox::TagVal val) { - Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu - + Vector > tags; + bool run_on_gpu = Gpu::inLaunchRegion(); #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel if (!run_on_gpu) #endif - for (MFIter mfi(*this); mfi.isValid(); ++mfi) { std::vector< std::pair > isects; - - ba.intersections(mfi.fabbox(),isects); - - TagBox& tags = get(mfi); - - for (int i = 0, N = isects.size(); i < N; i++) + for (MFIter mfi(*this); mfi.isValid(); ++mfi) { - tags.setVal(val,isects[i].second,0); + TagBox& fab = (*this)[mfi]; + Array4 const& arr = this->array(mfi); + ba.intersections(mfi.fabbox(), isects); + for (const auto& is : isects) { + Box const& b = is.second; + if (run_on_gpu) { + tags.push_back({arr,b}); + } else { + fab.setVal(val,b); + } + } } } + +#ifdef AMREX_USE_GPU + amrex::ParallelFor(tags, 1, + [=] AMREX_GPU_DEVICE (int i, int j, int k, int /*n*/, Array4 const& a) noexcept + { + a(i,j,k) = val; + }); +#endif } void TagBoxArray::coarsen (const IntVect & ratio) { - // If team is used, all team workers need to go through all the fabs, including ones they don't own. + // If team is used, all team workers need to go through all the fabs, + // including ones they don't own. int teamsize = ParallelDescriptor::TeamSize(); unsigned char flags = (teamsize == 1) ? 0 : MFIter::AllBoxes; - Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu - IntVect new_n_grow; for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { new_n_grow[idim] = (n_grow[idim]+ratio[idim]-1)/ratio[idim]; } #if defined(_OPENMP) -#pragma omp parallel if (teamsize == 1) +#pragma omp parallel if (teamsize == 1 && Gpu::notInLaunchRegion()) #endif for (MFIter mfi(*this,flags); mfi.isValid(); ++mfi) { diff --git a/Src/Base/AMReX_AsyncOut.cpp b/Src/Base/AMReX_AsyncOut.cpp index d289e2fcdd1..66bf2727f90 100644 --- a/Src/Base/AMReX_AsyncOut.cpp +++ b/Src/Base/AMReX_AsyncOut.cpp @@ -11,7 +11,13 @@ namespace AsyncOut { namespace { +#ifdef AMREX_USE_DPCPP +int s_asyncout = true; // Have this on by default for DPC++ for now so that + // I/O writing plotfile does not depend on unified + // memory. +#else int s_asyncout = false; +#endif int s_noutfiles = 64; MPI_Comm s_comm = MPI_COMM_NULL; diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H index e1e0c6a52c7..d3eec81fe1a 100644 --- a/Src/Base/AMReX_GpuAtomic.H +++ b/Src/Base/AMReX_GpuAtomic.H @@ -292,15 +292,14 @@ namespace detail { // Inc //////////////////////////////////////////////////////////////////////// - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +#ifdef AMREX_USE_DPCPP + template + AMREX_FORCE_INLINE unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return atomicInc(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; - constexpr auto as = sycl::access::address_space::global_space; - sycl::atomic a{sycl::multi_ptr(m)}; + sycl::atomic a{sycl::multi_ptr(m)}; unsigned int oldi = a.load(mo), newi; do { newi = (oldi >= value) ? 0u : (oldi+1u); @@ -312,6 +311,19 @@ namespace detail { return old; #endif } +#else + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept + { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return atomicInc(m, value); +#else + auto const old = *m; + *m = (old >= value) ? 0u : (old+1u); + return old; +#endif + } +#endif //////////////////////////////////////////////////////////////////////// // Dec diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H index 8d11e3060c6..37f4af6d66c 100644 --- a/Src/Base/AMReX_GpuDevice.H +++ b/Src/Base/AMReX_GpuDevice.H @@ -239,6 +239,7 @@ dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept } catch (sycl::exception const& ex) { amrex::Abort(std::string("dtoh_memcpy: ")+ex.what()+"!!!!!"); } + Gpu::synchronize(); // To mimic cuda behavior #else AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipMemcpy(p_h, p_d, sz, hipMemcpyDeviceToHost));,