diff --git a/Src/AmrCore/AMReX_TagBox.H b/Src/AmrCore/AMReX_TagBox.H
index f6adb04577b..2053dd4998c 100644
--- a/Src/AmrCore/AMReX_TagBox.H
+++ b/Src/AmrCore/AMReX_TagBox.H
@@ -62,41 +62,19 @@ public:
 
     /**
     * \brief Mark neighbors of every tagged cell a distance nbuff away
-    * only search interior for initial tagged points where nwid
-    * is given as the width of the bndry region.
     *
     * \param nbuff
-    * \param nwid
     */
-    void buffer (const IntVect& nbuf, const IntVect& nwid) noexcept;
-
-    /**
-    * \brief Add location of every tagged cell to IntVect array,
-    * starting at given location.  Returns the number of
-    * collated points.
-    *
-    * \param ar
-    * \param start
-    */
-    Long collate (Vector<IntVect>& ar, int start) const noexcept;
-
-    /**
-    * \brief Returns number of tagged cells in specified Box.
-    *
-    * \param bx
-    */
-    Long numTags (const Box& bx) const noexcept;
-
-    /**
-    * \brief Returns total number of tagged cells in the TagBox.
-    */
-    Long numTags () const noexcept;
+    void buffer (const IntVect& nbuf) noexcept;
 
     /**
     * \brief Returns Vector\<int\> of size domain.numPts() suitable for calling
     * Fortran, with positions set to same value as in the TagBox
     * dataPtr().
     */
+//#if (__cplusplus >= 201402L)
+//    [[deprecated("No need to use this unless calling Fortran < 2003")]]
+//#endif
     Vector<int> tags () const noexcept;
 
     /**
@@ -107,6 +85,9 @@ public:
     * \param ar
     * \param tilebx
     */
+//#if (__cplusplus >= 201402L)
+//    [[deprecated("No need to use this unless calling Fortran < 2003")]]
+//#endif
     void get_itags(Vector<int>& ar, const Box& tilebx) const noexcept;
 
     /**
@@ -115,6 +96,9 @@ public:
     *
     * \param ar
     */
+//#if (__cplusplus >= 201402L)
+//    [[deprecated("No need to use this unless calling Fortran < 2003")]]
+//#endif
     void tags (const Vector<int>& ar) noexcept;
 
     /**
@@ -123,6 +107,9 @@ public:
     *
     * \param ar
     */
+//#if (__cplusplus >= 201402L)
+//    [[deprecated("No need to use this unless calling Fortran < 2003")]]
+//#endif
     void tags_and_untags (const Vector<int>& ar) noexcept;
 
     /**
@@ -132,6 +119,9 @@ public:
     * \param ar
     * \param tilebx
     */
+//#if (__cplusplus >= 201402L)
+//    [[deprecated("No need to use this unless calling Fortran < 2003")]]
+//#endif
     void tags (const Vector<int>& ar, const Box& tilebx) noexcept;
 
     /**
@@ -141,6 +131,9 @@ public:
     * \param ar
     * \param tilebx
     */
+//#if (__cplusplus >= 201402L)
+//    [[deprecated("No need to use this unless calling Fortran < 2003")]]
+//#endif
     void tags_and_untags (const Vector<int>& ar, const Box& tilebx) noexcept;
 };
 
@@ -225,17 +218,17 @@ public:
     */
     void coarsen (const IntVect& ratio);
 
-    /**
-    * \brief The total number of Tags in all the contained TagBoxes.
-    */
-    Long numTags () const;
-
     /**
     * \brief Calls collate() on all contained TagBoxes.
     *
     * \param TheGlobalCollateSpace
     */
     void collate (Vector<IntVect>& TheGlobalCollateSpace) const;
+
+    void local_collate_cpu (Vector<IntVect>& v) const;
+#ifdef AMREX_USE_GPU
+    void local_collate_gpu (Vector<IntVect>& v) const;
+#endif
 };
 
 }
diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp
index 550b47e66a6..2ac3163e0cc 100644
--- a/Src/AmrCore/AMReX_TagBox.cpp
+++ b/Src/AmrCore/AMReX_TagBox.cpp
@@ -9,6 +9,7 @@
 #include <AMReX_BLProfiler.H>
 #include <AMReX_ccse-mpi.H>
 #include <AMReX_iMultiFab.H>
+#include <AMReX_MultiFabUtil.H>
 
 namespace amrex {
 
@@ -20,15 +21,11 @@ TagBox::TagBox (Arena* ar) noexcept
 
 TagBox::TagBox (const Box& bx, int n, Arena* ar)
     : BaseFab<TagBox::TagType>(bx,n,ar)
-{
-    setVal<RunOn::Host>(TagBox::CLEAR);
-}
+{}
 
 TagBox::TagBox (const Box& bx, int n, bool alloc, bool shared, Arena* ar)
     : BaseFab<TagBox::TagType>(bx,n,alloc,shared,ar)
-{
-    if (alloc) setVal<RunOn::Host>(TagBox::CLEAR);
-}
+{}
 
 TagBox::TagBox (const TagBox& rhs, MakeType make_type, int scomp, int ncomp)
     : BaseFab<TagBox::TagType>(rhs,make_type,scomp,ncomp)
@@ -37,146 +34,74 @@ TagBox::TagBox (const TagBox& rhs, MakeType make_type, int scomp, int ncomp)
 void
 TagBox::coarsen (const IntVect& ratio, const Box& cbox) noexcept
 {
-    // xxxxx TODO: gpu
-
     BL_ASSERT(nComp() == 1);
     Array4<char const> const& farr = this->const_array();
 
-    TagBox cfab(cbox, 1, The_Cpu_Arena());
+    TagBox cfab(cbox, 1, The_Arena());
+    Elixir eli = cfab.elixir();
     Array4<char> const& carr = cfab.array();
 
-    const auto flo = amrex::lbound(domain);
-    const auto fhi = amrex::ubound(domain);
+    Box fdomain = domain;
     Dim3 r{1,1,1};
     AMREX_D_TERM(r.x = ratio[0];, r.y = ratio[1];, r.z = ratio[2]);
 
-    for (int k = flo.z; k <= fhi.z; ++k) {
-        int kc = amrex::coarsen(k,r.z);
-        for (int j = flo.y; j <= fhi.y; ++j) {
-            int jc = amrex::coarsen(j,r.y);
-            for (int i = flo.x; i <= fhi.x; ++i) {
-                int ic = amrex::coarsen(i,r.x);
-                carr(ic,jc,kc) = carr(ic,jc,kc) || farr(i,j,k);
-            }
-        }
-    }
-
-    std::memcpy(this->dataPtr(), cfab.dataPtr(), sizeof(TagType)*cbox.numPts());
-    this->domain = cbox;
-}
-
-void 
-TagBox::buffer (const IntVect& nbuff, const IntVect& nwid) noexcept
-{
-    //
-    // Note: this routine assumes cell with TagBox::SET tag are in
-    // interior of tagbox (region = grow(domain,-nwid)).
-    //
-    Box inside(domain);
-    inside.grow(-nwid);
-    const int* inlo = inside.loVect();
-    const int* inhi = inside.hiVect();
-
-    int klo = 0, khi = 0, jlo = 0, jhi = 0, ilo, ihi;
-    AMREX_D_TERM(ilo=inlo[0]; ihi=inhi[0]; ,
-                 jlo=inlo[1]; jhi=inhi[1]; ,
-                 klo=inlo[2]; khi=inhi[2];)
-
-    int ni = 0, nj = 0, nk = 0;
-    AMREX_D_TERM(ni=nbuff[0];, nj=nbuff[1];, nk=nbuff[2];)
-
-    IntVect d_length = domain.size();
-    const int* len = d_length.getVect();
-    const int* lo = domain.loVect();
-    TagType* d = dataPtr();
-
-    amrex::ignore_unused(len);
-#define OFF(i,j,k,lo,len) AMREX_D_TERM(i-lo[0], +(j-lo[1])*len[0] , +(k-lo[2])*len[0]*len[1])
-   
-    for (int k = klo; k <= khi; k++)
+    AMREX_HOST_DEVICE_FOR_3D(cbox, i, j, k,
     {
-        for (int j = jlo; j <= jhi; j++)
-        {
-            for (int i = ilo; i <= ihi; i++)
-            {
-                TagType* d_check = d + OFF(i,j,k,lo,len);
-                if (*d_check == TagBox::SET)
-                {
-                    for (int kk = -nk; kk <= nk; kk++)
-                    {
-                        for (int jj = -nj; jj <= nj; jj++)
-                        {
-                            for (int ii = -ni; ii <= ni; ii++)
-                            {
-                                TagType* dn = d_check+ AMREX_D_TERM(ii, +jj*len[0], +kk*len[0]*len[1]);
-                                if (*dn !=TagBox::SET)
-                                    *dn = TagBox::BUF;
-                            }
-                        }
+        TagType t = TagBox::CLEAR;
+        for (int koff = 0; koff < r.z; ++koff) {
+            int kk = k*r.z + koff;
+            for (int joff = 0; joff < r.y; ++joff) {
+                int jj = j*r.y + joff;
+                for (int ioff = 0; ioff < r.x; ++ioff) {
+                    int ii = i*r.x + ioff;
+                    if (fdomain.contains(IntVect(AMREX_D_DECL(ii,jj,kk)))) {
+                        t = t || farr(ii,jj,kk);
                     }
                 }
             }
         }
-    }
-#undef OFF
-}
+        carr(i,j,k) = t;
+    });
 
-Long
-TagBox::numTags () const noexcept
-{
-    Long nt = 0L;
-    Long len = domain.numPts();
-    const TagType* d = dataPtr();
-    for (Long n = 0; n < len; ++n)
+#ifdef AMREX_USE_GPU
+    if (Gpu::inLaunchRegion()) {
+        Gpu::dtod_memcpy_async(this->dataPtr(), cfab.dataPtr(), sizeof(TagType)*cbox.numPts());
+    } else
+#endif
     {
-        if (d[n] != TagBox::CLEAR)
-            ++nt;
+        std::memcpy(this->dataPtr(), cfab.dataPtr(), sizeof(TagType)*cbox.numPts());
     }
-    return nt;
-}
-
-Long
-TagBox::numTags (const Box& b) const noexcept
-{
-   TagBox tempTagBox(b,1);
-   tempTagBox.copy<RunOn::Host>(*this);
-   return tempTagBox.numTags();
+    this->domain = cbox;
 }
 
-Long
-TagBox::collate (Vector<IntVect>& ar, int start) const noexcept
+void
+TagBox::buffer (const IntVect& a_nbuff) noexcept
 {
-    BL_ASSERT(start >= 0);
-    //
-    // Starting at given offset of array ar, enter location (IntVect) of
-    // each tagged cell in tagbox.
-    //
-    Long count       = 0;
-    IntVect d_length = domain.size();
-    const int* len   = d_length.getVect();
-    const int* lo    = domain.loVect();
-    const TagType* d = dataPtr();
-    int ni = 1, nj = 1, nk = 1;
-    AMREX_D_TERM(ni = len[0]; , nj = len[1]; , nk = len[2];)
-
-    for (int k = 0; k < nk; k++)
+    Array4<char> const& a = this->array();
+    Dim3 nbuf = a_nbuff.dim3();
+    const auto lo = amrex::lbound(domain);
+    const auto hi = amrex::ubound(domain);
+    AMREX_HOST_DEVICE_FOR_3D(domain, i, j, k,
     {
-        for (int j = 0; j < nj; j++)
-        {
-            for (int i = 0; i < ni; i++)
-            {
-                const TagType* dn = d + AMREX_D_TERM(i, +j*len[0], +k*len[0]*len[1]);
-                if (*dn != TagBox::CLEAR)
-                {
-                    ar[start++] = IntVect(AMREX_D_DECL(lo[0]+i,lo[1]+j,lo[2]+k));
-                    count++;
-                }
-            }
+        if (a(i,j,k) == TagBox::CLEAR) {
+            bool to_buf = false;
+            int imin = amrex::max(i-nbuf.x, lo.x);
+            int jmin = amrex::max(j-nbuf.y, lo.y);
+            int kmin = amrex::max(k-nbuf.z, lo.z);
+            int imax = amrex::min(i+nbuf.x, hi.x);
+            int jmax = amrex::min(j+nbuf.y, hi.y);
+            int kmax = amrex::min(k+nbuf.z, hi.z);
+            for (int kk = kmin; kk <= kmax && !to_buf; ++kk) {
+            for (int jj = jmin; jj <= jmax && !to_buf; ++jj) {
+            for (int ii = imin; ii <= imax && !to_buf; ++ii) {
+                if (a(ii,jj,kk) == TagBox::SET) to_buf = true;
+            }}}
+            if (to_buf) a(i,j,k) = TagBox::BUF;
         }
-    }
-    return count;
+    });
 }
 
+// DEPRECATED
 Vector<int>
 TagBox::tags () const noexcept
 {
@@ -194,7 +119,7 @@ TagBox::tags () const noexcept
     return ar;
 }
 
-
+// DEPRECATED
 // Set values as specified by the array -- this only tags.
 // It's an error if ar.length() != domain.numPts().
 void
@@ -212,6 +137,7 @@ TagBox::tags (const Vector<int>& ar) noexcept
     }
 }
 
+// DEPRECATED
 // Set values as specified by the array -- this tags and untags.
 // It's an error if ar.length() != domain.numPts().
 void
@@ -229,6 +155,7 @@ TagBox::tags_and_untags (const Vector<int>& ar) noexcept
     }
 }
 
+// DEPRECATED
 // Since a TagBox is a BaseFab<char>, we can use this utility
 // function to allocate an integer array to have the same number 
 // of elements as cells in tilebx
@@ -271,6 +198,7 @@ TagBox::get_itags(Vector<int>& ar, const Box& tilebx) const noexcept
     }
 }
 
+// DEPRECATED
 // Set values as specified by the array -- this only tags.
 // only changes values in the tilebx region
 void 
@@ -304,6 +232,7 @@ TagBox::tags (const Vector<int>& ar, const Box& tilebx) noexcept
     }
 }
 
+// DEPRECATED
 // Set values as specified by the array -- this tags and untags.
 // only changes values in the tilebx region
 void 
@@ -343,7 +272,7 @@ TagBoxArray::TagBoxArray (const BoxArray& ba,
     :
     FabArray<TagBox>(ba,dm,1,_ngrow,MFInfo(),DefaultFabFactory<TagBox>())
 {
-    if (SharedMemory()) setVal(TagBox::CLEAR);
+    setVal(TagBox::CLEAR);
 }
 
 TagBoxArray::TagBoxArray (const BoxArray& ba,
@@ -352,23 +281,22 @@ TagBoxArray::TagBoxArray (const BoxArray& ba,
     :
     FabArray<TagBox>(ba,dm,1,_ngrow,MFInfo(),DefaultFabFactory<TagBox>())
 {
-    if (SharedMemory()) setVal(TagBox::CLEAR);
+    setVal(TagBox::CLEAR);
 }
 
 void 
 TagBoxArray::buffer (const IntVect& nbuf)
 {
-    Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu
-
     AMREX_ASSERT(nbuf.allLE(n_grow));
 
     if (nbuf.max() > 0)
     {
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-       for (MFIter mfi(*this); mfi.isValid(); ++mfi)
-           get(mfi).buffer(nbuf, n_grow);
+       for (MFIter mfi(*this); mfi.isValid(); ++mfi) {
+           get(mfi).buffer(nbuf);
+       }
     }
 }
 
@@ -377,80 +305,294 @@ TagBoxArray::mapPeriodicRemoveDuplicates (const Geometry& geom)
 {
     BL_PROFILE("TagBoxArray::mapPRD");
 
-    Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu
+    if (Gpu::inLaunchRegion())
+    {
+        // There is not atomicAdd for char.  So we have to use int.
+        iMultiFab itag = amrex::cast<iMultiFab>(*this);
+        iMultiFab tmp(boxArray(),DistributionMap(),1,nGrowVect());
+        tmp.setVal(0);
+        tmp.ParallelAdd(itag, 0, 0, 1, nGrowVect(), nGrowVect(), geom.periodicity());
+
+        // We need to keep tags in periodic boundary
+        const auto owner_mask = amrex::OwnerMask(tmp, Periodicity::NonPeriodic(), nGrowVect());
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+        for (MFIter mfi(tmp); mfi.isValid(); ++mfi) {
+            Box const& box = mfi.fabbox();
+            Array4<TagType> const& tag =this->array(mfi);
+            Array4<int const> const& tmptag = tmp.const_array(mfi);
+            Array4<int const> const& msk = owner_mask->const_array(mfi);
+            amrex::ParallelFor(box,
+            [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+            {
+                if (msk(i,j,k)) {
+                    tag(i,j,k) = static_cast<char>(tmptag(i,j,k));
+                } else {
+                    tag(i,j,k) = TagBox::CLEAR;
+                }
+            });
+        }
+    }
+    else
+    {
+        TagBoxArray tmp(boxArray(),DistributionMap(),nGrowVect()); // note that tmp is filled w/ CLEAR.
+        tmp.ParallelAdd(*this, 0, 0, 1, nGrowVect(), nGrowVect(), geom.periodicity());
 
-    TagBoxArray tmp(boxArray(),DistributionMap(),nGrowVect()); // note that tmp is filled w/ CLEAR.
+        // We need to keep tags in periodic boundary
+        const auto owner_mask = amrex::OwnerMask(tmp, Periodicity::NonPeriodic(), nGrowVect());
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+        for (MFIter mfi(tmp); mfi.isValid(); ++mfi) {
+            Box const& box = mfi.fabbox();
+            Array4<TagType> const& tag = tmp.array(mfi);
+            Array4<int const> const& msk = owner_mask->const_array(mfi);
+            AMREX_LOOP_3D(box, i, j, k,
+            {
+                if (!msk(i,j,k)) tag(i,j,k) = TagBox::CLEAR;
+            });
+        }
+
+        std::swap(*this, tmp);
+    }
+}
 
-    tmp.ParallelAdd(*this, 0, 0, 1, nGrowVect(), nGrowVect(), geom.periodicity());
+void
+TagBoxArray::local_collate_cpu (Vector<IntVect>& v) const
+{
+    if (this->local_size() == 0) return;
 
-    // We need to keep tags in periodic boundary
-    const auto owner_mask = amrex::OwnerMask(tmp, Periodicity::NonPeriodic(), nGrowVect());
+    Vector<int> count(this->local_size());
 #ifdef _OPENMP
-#pragma omp parallel if (Gpu::notInLaunchRegion())
+#pragma omp parallel
 #endif
-    for (MFIter mfi(tmp); mfi.isValid(); ++mfi) {
-        Box const& box = mfi.fabbox();
-        Array4<TagType> const& tag = tmp.array(mfi);
-        Array4<int const> const& msk = owner_mask->const_array(mfi);
-        AMREX_HOST_DEVICE_FOR_3D(box, i, j, k,
+    for (MFIter fai(*this); fai.isValid(); ++fai)
+    {
+        Array4<char const> const& arr = this->const_array(fai);
+        Box const& bx = fai.fabbox();
+        int c = 0;
+        AMREX_LOOP_3D(bx,i,j,k,
         {
-            if (!msk(i,j,k)) tag(i,j,k) = TagBox::CLEAR;
+            if (arr(i,j,k) != TagBox::CLEAR) ++c;
         });
+        count[fai.LocalIndex()] = c;
     }
 
-    std::swap(*this, tmp);
-}
+    Vector<int> offset(count.size()+1);
+    offset[0] = 0;
+    std::partial_sum(count.begin(), count.end(), offset.begin()+1);
 
-Long
-TagBoxArray::numTags () const
-{
-    Long ntag = 0;
+    v.resize(offset.back());
 
-    Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu
+    if (v.empty()) return;
 
 #ifdef _OPENMP
-#pragma omp parallel reduction(+:ntag)
+#pragma omp parallel
 #endif
-    for (MFIter mfi(*this); mfi.isValid(); ++mfi)
+    for (MFIter fai(*this); fai.isValid(); ++fai)
     {
-        ntag += get(mfi).numTags();
+        int li = fai.LocalIndex();
+        if (count[li] > 0) {
+            IntVect* p = v.data() + offset[li];
+            Array4<char const> const& arr = this->const_array(fai);
+            Box const& bx = fai.fabbox();
+            AMREX_LOOP_3D(bx,i,j,k,
+            {
+                if (arr(i,j,k) != TagBox::CLEAR) {
+                    *p++ = IntVect(AMREX_D_DECL(i,j,k));
+                }
+            });
+        }
     }
-    
-    ParallelDescriptor::ReduceLongSum(ntag);
-    
-    return ntag;
 }
 
+#ifdef AMREX_USE_GPU
 void
-TagBoxArray::collate (Vector<IntVect>& TheGlobalCollateSpace) const
+TagBoxArray::local_collate_gpu (Vector<IntVect>& v) const
 {
-    BL_PROFILE("TagBoxArray::collate()");
+    const int nfabs = this->local_size();
+    if (nfabs == 0) return;
 
-    Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu
+    constexpr int block_size = 128;
+    Vector<int> nblocks(nfabs);
+    for (MFIter fai(*this); fai.isValid(); ++fai)
+    {
+        Box const& bx = fai.fabbox();
+        nblocks[fai.LocalIndex()] = (bx.numPts() + block_size-1) / block_size;
+    }
+    Vector<int> blockoffset(nblocks.size()+1);
+    blockoffset[0] = 0;
+    std::partial_sum(nblocks.begin(), nblocks.end(), blockoffset.begin()+1);
+    int ntotblocks = blockoffset.back();
 
-    Long count = 0;
+    PODVector<int,DeviceArenaAllocator<int> > dv_ntags(ntotblocks);
 
-#ifdef _OPENMP
-#pragma omp parallel reduction(+:count)
-#endif
     for (MFIter fai(*this); fai.isValid(); ++fai)
     {
-        count += get(fai).numTags();
+        const int li = fai.LocalIndex();
+        int* ntags = dv_ntags.data() + blockoffset[li];
+        const int ncells = fai.fabbox().numPts();
+        const char* tags = (*this)[fai].dataPtr();
+#ifdef AMREX_USE_DPCPP
+        amrex::launch(nblocks[li], block_size, sizeof(int)*Gpu::Device::warp_size,
+                      Gpu::Device::gpuStream(),
+        [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
+        {
+            int bid = h.item.get_group_linear_id();
+            int tid = h.item.get_local_id(0);
+            int icell = h.item.get_global_id(0);
+
+            int t = 0;
+            if (icell < ncells && tags[icell] != TagBox::CLEAR) {
+                t = 1;
+            }
+
+            t = Gpu::blockReduce<Gpu::Device::warp_size>
+                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, h);
+            if (tid == 0) {
+                ntags[bid] = t;
+            }
+        });
+#else
+        amrex::launch(nblocks[li], block_size, Gpu::Device::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept
+        {
+            int bid = blockIdx.x;
+            int tid = threadIdx.x;
+            int icell = blockDim.x*blockIdx.x+threadIdx.x;
+
+            int t = 0;
+            if (icell < ncells && tags[icell] != TagBox::CLEAR) {
+                t = 1;
+            }
+
+            t = Gpu::blockReduce<Gpu::Device::warp_size>
+                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
+            if (tid == 0) {
+                ntags[bid] = t;
+            }
+        });
+#endif
     }
 
-    //
-    // Local space for holding just those tags we want to gather to the root cpu.
-    //
-    Vector<IntVect> TheLocalCollateSpace(count);
+    PODVector<int,PinnedArenaAllocator<int> > hv_ntags(ntotblocks);
+    Gpu::dtoh_memcpy(hv_ntags.data(), dv_ntags.data(), ntotblocks*sizeof(int));
+
+    PODVector<int,PinnedArenaAllocator<int> > hv_tags_offset(ntotblocks+1);
+    hv_tags_offset[0] = 0;
+    std::partial_sum(hv_ntags.begin(), hv_ntags.end(), hv_tags_offset.begin()+1);
+    int ntotaltags = hv_tags_offset.back();
+
+    if (ntotaltags == 0) return;
 
-    count = 0;
+    PODVector<int,DeviceArenaAllocator<int> > dv_tags_offset(ntotblocks);
+    int* dp_tags_offset = dv_tags_offset.data();
+    Gpu::htod_memcpy(dp_tags_offset, hv_tags_offset.data(), ntotblocks*sizeof(int));
+#ifdef AMREX_USE_DPCPP
+    Gpu::synchronize();
+#endif
+
+    PODVector<IntVect,DeviceArenaAllocator<IntVect> > dv_tags(ntotaltags);
+    IntVect* dp_tags = dv_tags.data();
 
-    // unsafe to do OMP
+    int iblock = 0;
     for (MFIter fai(*this); fai.isValid(); ++fai)
     {
-        count += get(fai).collate(TheLocalCollateSpace,count);
+        const int li = fai.LocalIndex();
+        int iblock_begin = iblock;
+        int iblock_end = iblock + nblocks[li];
+        iblock = iblock_end;
+        int count = 0;
+        for (int ib = iblock_begin; ib < iblock_end; ++ib) {
+            count += hv_ntags[ib];
+        }
+        if (count > 0) {
+            Box const& bx = fai.fabbox();
+            const auto lo  = amrex::lbound(bx);
+            const auto len = amrex::length(bx);
+            const int ncells = bx.numPts();
+            const char* tags = (*this)[fai].dataPtr();
+#ifdef AMREX_USE_DPCPP
+            amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
+            [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
+            {
+                int bid = h.item.get_group(0);
+                int tid = h.item.get_local_id(0);
+                int icell = h.item.get_global_id(0);
+
+                unsigned int* shared_counter = (unsigned int*)h.local;
+                if (tid == 0) {
+                    *shared_counter = 0;
+                }
+                h.item.barrier(sycl::access::fence_space::local_space);
+
+                if (icell < ncells && tags[icell] != TagBox::CLEAR) {
+                    unsigned int itag = Gpu::Atomic::Inc<sycl::access::address_space::local_space>
+                        (shared_counter, 20480u);
+                    IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
+                    int k =  icell /   (len.x*len.y);
+                    int j = (icell - k*(len.x*len.y)) /   len.x;
+                    int i = (icell - k*(len.x*len.y)) - j*len.x;
+                    i += lo.x;
+                    j += lo.y;
+                    k += lo.z;
+                    p[itag] = IntVect(AMREX_D_DECL(i,j,k));
+                }
+            });
+#else
+            amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
+            [=] AMREX_GPU_DEVICE () noexcept
+            {
+                int bid = blockIdx.x;
+                int tid = threadIdx.x;
+                int icell = blockDim.x*blockIdx.x+threadIdx.x;
+
+                Gpu::SharedMemory<unsigned int> gsm;
+                unsigned int * shared_counter = gsm.dataPtr();
+                if (tid == 0) {
+                    *shared_counter = 0;
+                }
+                __syncthreads();
+
+                if (icell < ncells && tags[icell] != TagBox::CLEAR) {
+                    unsigned int itag = Gpu::Atomic::Inc(shared_counter, blockDim.x);
+                    IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
+                    int k =  icell /   (len.x*len.y);
+                    int j = (icell - k*(len.x*len.y)) /   len.x;
+                    int i = (icell - k*(len.x*len.y)) - j*len.x;
+                    i += lo.x;
+                    j += lo.y;
+                    k += lo.z;
+                    p[itag] = IntVect(AMREX_D_DECL(i,j,k));
+                }
+            });
+#endif
+        }
     }
 
+    v.resize(ntotaltags);
+    Gpu::dtoh_memcpy(v.data(), dp_tags, ntotaltags*sizeof(IntVect));
+}
+#endif
+
+void
+TagBoxArray::collate (Vector<IntVect>& TheGlobalCollateSpace) const
+{
+    BL_PROFILE("TagBoxArray::collate()");
+
+    Vector<IntVect> TheLocalCollateSpace;
+#ifdef AMREX_USE_GPU
+    if (Gpu::inLaunchRegion()) {
+        local_collate_gpu(TheLocalCollateSpace);
+    } else
+#endif
+    {
+        local_collate_cpu(TheLocalCollateSpace);
+    }
+
+    Long count = TheLocalCollateSpace.size();
+
     //
     // The total number of tags system wide that must be collated.
     //
@@ -501,60 +643,68 @@ TagBoxArray::collate (Vector<IntVect>& TheGlobalCollateSpace) const
 }
 
 void
-TagBoxArray::setVal (const BoxList& bl,
-                     TagBox::TagVal val)
+TagBoxArray::setVal (const BoxList& bl, TagBox::TagVal val)
 {
     BoxArray ba(bl);
     setVal(ba,val);
 }
 
 void
-TagBoxArray::setVal (const BoxDomain& bd,
-                     TagBox::TagVal   val)
+TagBoxArray::setVal (const BoxDomain& bd, TagBox::TagVal val)
 {
     setVal(bd.boxList(),val);
 }
 
 void
-TagBoxArray::setVal (const BoxArray& ba,
-                     TagBox::TagVal  val)
+TagBoxArray::setVal (const BoxArray& ba, TagBox::TagVal val)
 {
-    Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu
-
+    Vector<Array4BoxTag<char> > tags;
+    bool run_on_gpu = Gpu::inLaunchRegion();
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (!run_on_gpu)
 #endif
-    for (MFIter mfi(*this); mfi.isValid(); ++mfi)
     {
         std::vector< std::pair<int,Box> > isects;
-
-        ba.intersections(mfi.fabbox(),isects);
-
-        TagBox& tags = get(mfi);
-
-        for (int i = 0, N = isects.size(); i < N; i++)
+        for (MFIter mfi(*this); mfi.isValid(); ++mfi)
         {
-            tags.setVal<RunOn::Host>(val,isects[i].second,0);
+            TagBox& fab = (*this)[mfi];
+            Array4<char> const& arr = this->array(mfi);
+            ba.intersections(mfi.fabbox(), isects);
+            for (const auto& is : isects) {
+                Box const& b = is.second;
+                if (run_on_gpu) {
+                    tags.push_back({arr,b});
+                } else {
+                   fab.setVal<RunOn::Host>(val,b);
+                }
+            }
         }
     }
+
+#ifdef AMREX_USE_GPU
+    amrex::ParallelFor(tags, 1,
+    [=] AMREX_GPU_DEVICE (int i, int j, int k, int /*n*/, Array4<char> const& a) noexcept
+    {
+        a(i,j,k) = val;
+    });
+#endif
 }
 
 void
 TagBoxArray::coarsen (const IntVect & ratio)
 {
-    // If team is used, all team workers need to go through all the fabs, including ones they don't own.
+    // If team is used, all team workers need to go through all the fabs,
+    // including ones they don't own.
     int teamsize = ParallelDescriptor::TeamSize();
     unsigned char flags = (teamsize == 1) ? 0 : MFIter::AllBoxes;
 
-    Gpu::LaunchSafeGuard lsg(false); // xxxxx TODO: gpu
-
     IntVect new_n_grow;
     for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
         new_n_grow[idim] = (n_grow[idim]+ratio[idim]-1)/ratio[idim];
     }
 
 #if defined(_OPENMP)
-#pragma omp parallel if (teamsize == 1)
+#pragma omp parallel if (teamsize == 1 && Gpu::notInLaunchRegion())
 #endif
     for (MFIter mfi(*this,flags); mfi.isValid(); ++mfi)
     {
diff --git a/Src/Base/AMReX_AsyncOut.cpp b/Src/Base/AMReX_AsyncOut.cpp
index d289e2fcdd1..66bf2727f90 100644
--- a/Src/Base/AMReX_AsyncOut.cpp
+++ b/Src/Base/AMReX_AsyncOut.cpp
@@ -11,7 +11,13 @@ namespace AsyncOut {
 
 namespace {
 
+#ifdef AMREX_USE_DPCPP
+int s_asyncout = true; // Have this on by default for DPC++ for now so that
+                       // I/O writing plotfile does not depend on unified
+                       // memory.
+#else
 int s_asyncout = false;
+#endif
 int s_noutfiles = 64;
 MPI_Comm s_comm = MPI_COMM_NULL;
 
diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H
index e1e0c6a52c7..d3eec81fe1a 100644
--- a/Src/Base/AMReX_GpuAtomic.H
+++ b/Src/Base/AMReX_GpuAtomic.H
@@ -292,15 +292,14 @@ namespace detail {
 //  Inc
 ////////////////////////////////////////////////////////////////////////
 
-    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+#ifdef AMREX_USE_DPCPP
+    template <sycl::access::address_space AS = sycl::access::address_space::global_space>
+    AMREX_FORCE_INLINE
     unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept
     {
-#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
-        return atomicInc(m, value);
-#elif defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SYCL_DEVICE_ONLY__)
         constexpr auto mo = sycl::memory_order::relaxed;
-        constexpr auto as = sycl::access::address_space::global_space;
-        sycl::atomic<unsigned int,as> a{sycl::multi_ptr<unsigned int,as>(m)};
+        sycl::atomic<unsigned int,AS> a{sycl::multi_ptr<unsigned int,AS>(m)};
         unsigned int oldi = a.load(mo), newi;
         do {
             newi = (oldi >= value) ? 0u : (oldi+1u);
@@ -312,6 +311,19 @@ namespace detail {
         return old;
 #endif
     }
+#else
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept
+    {
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+        return atomicInc(m, value);
+#else
+        auto const old = *m;
+        *m = (old >= value) ? 0u : (old+1u);
+        return old;
+#endif
+    }
+#endif
 
 ////////////////////////////////////////////////////////////////////////
 //  Dec
diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H
index 8d11e3060c6..37f4af6d66c 100644
--- a/Src/Base/AMReX_GpuDevice.H
+++ b/Src/Base/AMReX_GpuDevice.H
@@ -239,6 +239,7 @@ dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept
     } catch (sycl::exception const& ex) {
         amrex::Abort(std::string("dtoh_memcpy: ")+ex.what()+"!!!!!");
     }
+    Gpu::synchronize(); // To mimic cuda behavior
 #else
     AMREX_HIP_OR_CUDA(
         AMREX_HIP_SAFE_CALL(hipMemcpy(p_h, p_d, sz, hipMemcpyDeviceToHost));,