Merge pull request #35285 from VinInn/OuterHitOfCell

reduce size of isOuterHitOfCell in patatrack
cms-sw · Oct 1, 2021 · 7e79ae6 · 7e79ae6
2 parents 00f808e + 592ffaa
commit 7e79ae6
Show file tree

Hide file tree

Showing 17 changed files with 94 additions and 59 deletions.
diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
@@ -18,9 +18,13 @@ class SiPixelClustersCUDA {
   SiPixelClustersCUDA(SiPixelClustersCUDA &&) = default;
   SiPixelClustersCUDA &operator=(SiPixelClustersCUDA &&) = default;
 
-  void setNClusters(uint32_t nClusters) { nClusters_h = nClusters; }
+  void setNClusters(uint32_t nClusters, int32_t offsetBPIX2) {
+    nClusters_h = nClusters;
+    offsetBPIX2_h = offsetBPIX2;
+  }
 
   uint32_t nClusters() const { return nClusters_h; }
+  int32_t offsetBPIX2() const { return offsetBPIX2_h; }
 
   uint32_t *moduleStart() { return moduleStart_d.get(); }
   uint32_t *clusInModule() { return clusInModule_d.get(); }
@@ -58,6 +62,7 @@ class SiPixelClustersCUDA {
   cms::cuda::device::unique_ptr<DeviceConstView> view_d;  // "me" pointer
 
   uint32_t nClusters_h = 0;
+  int32_t offsetBPIX2_h = 0;
 };
 
 #endif  // CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
@@ -16,6 +16,7 @@ class TrackingRecHit2DHeterogeneous {
 
   explicit TrackingRecHit2DHeterogeneous(
       uint32_t nHits,
+      int32_t offsetBPIX2,
       pixelCPEforGPU::ParamsOnGPU const* cpeParams,
       uint32_t const* hitsModuleStart,
       cudaStream_t stream,
@@ -32,6 +33,7 @@ class TrackingRecHit2DHeterogeneous {
   TrackingRecHit2DSOAView const* view() const { return m_view.get(); }
 
   auto nHits() const { return m_nHits; }
+  auto offsetBPIX2() const { return m_offsetBPIX2; }
 
   auto hitsModuleStart() const { return m_hitsModuleStart; }
   auto hitsLayerStart() { return m_hitsLayerStart; }
@@ -60,6 +62,7 @@ class TrackingRecHit2DHeterogeneous {
   unique_ptr<TrackingRecHit2DSOAView> m_view;  //!
 
   uint32_t m_nHits;
+  int32_t m_offsetBPIX2;
 
   uint32_t const* m_hitsModuleStart;  // needed for legacy, this is on GPU!
 
@@ -80,11 +83,12 @@ using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::Host
 template <typename Traits>
 TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
     uint32_t nHits,
+    int32_t offsetBPIX2,
     pixelCPEforGPU::ParamsOnGPU const* cpeParams,
     uint32_t const* hitsModuleStart,
     cudaStream_t stream,
     TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input)
-    : m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
+    : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {
   auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
 
   view->m_nHits = nHits;

diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp
@@ -18,11 +18,11 @@ int main() {
   auto nHits = 200;
   // inner scope to deallocate memory before destroying the stream
   {
-    TrackingRecHit2DGPU tkhit(nHits, nullptr, nullptr, stream);
+    TrackingRecHit2DGPU tkhit(nHits, 0, nullptr, nullptr, stream);
 
     testTrackingRecHit2D::runKernels(tkhit.view());
 
-    TrackingRecHit2DHost tkhitH(nHits, nullptr, nullptr, stream, &tkhit);
+    TrackingRecHit2DHost tkhitH(nHits, 0, nullptr, nullptr, stream, &tkhit);
     cudaStreamSynchronize(stream);
     assert(tkhitH.view());
     assert(tkhitH.view()->nHits() == unsigned(nHits));

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -452,7 +452,10 @@ namespace pixelgpudetails {
 
   }  // end of Raw to Digi kernel
 
-  __global__ void fillHitsModuleStart(uint32_t const *__restrict__ cluStart, uint32_t *__restrict__ moduleStart) {
+  __global__ void fillHitsModuleStart(uint32_t const *__restrict__ clusInModule,
+                                      uint32_t *__restrict__ moduleStart,
+                                      uint32_t const *__restrict__ nModules,
+                                      uint32_t *__restrict__ nModules_Clusters) {
     assert(gpuClustering::maxNumModules < 2048);  // easy to extend at least till 32*1024
     assert(1 == gridDim.x);
     assert(0 == blockIdx.x);
@@ -461,7 +464,7 @@ namespace pixelgpudetails {
 
     // limit to maxHitsInModule()
     for (int i = first, iend = gpuClustering::maxNumModules; i < iend; i += blockDim.x) {
-      moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), cluStart[i]);
+      moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), clusInModule[i]);
     }
 
     __shared__ uint32_t ws[32];
@@ -473,9 +476,18 @@ namespace pixelgpudetails {
     }
     __syncthreads();
 
+    if (threadIdx.x == 0) {
+      // copy the number of modules
+      nModules_Clusters[0] = *nModules;
+      // last element holds the number of all clusters
+      nModules_Clusters[1] = moduleStart[gpuClustering::maxNumModules];
+      // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1)
+      nModules_Clusters[2] = moduleStart[96];
+    }
+
 #ifdef GPU_DEBUG
     assert(0 == moduleStart[0]);
-    auto c0 = std::min(gpuClustering::maxHitsInModule(), cluStart[0]);
+    auto c0 = std::min(gpuClustering::maxHitsInModule(), clusInModule[0]);
     assert(c0 == moduleStart[1]);
     assert(moduleStart[1024] >= moduleStart[1023]);
     assert(moduleStart[1025] >= moduleStart[1024]);
@@ -519,8 +531,6 @@ namespace pixelgpudetails {
     }
     clusters_d = SiPixelClustersCUDA(gpuClustering::maxNumModules, stream);
 
-    nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2, stream);
-
     if (wordCounter)  // protect in case of empty event....
     {
       const int threadsPerBlock = 512;
@@ -597,10 +607,6 @@ namespace pixelgpudetails {
           digis_d.moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter);
       cudaCheck(cudaGetLastError());
 
-      // read the number of modules into a data member, used by getProduct())
-      cudaCheck(cudaMemcpyAsync(
-          &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream));
-
       threadsPerBlock = 256;
       blocks = maxNumModules;
 #ifdef GPU_DEBUG
@@ -635,16 +641,15 @@ namespace pixelgpudetails {
       // rechits) so that the number of clusters/hits can be made
       // available in the rechit producer without additional points of
       // synchronization/ExternalWork
-
+      auto nModules_Clusters_d = cms::cuda::make_device_unique<uint32_t[]>(3, stream);
       // MUST be ONE block
-      fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d.clusInModule(), clusters_d.clusModuleStart());
+      fillHitsModuleStart<<<1, 1024, 0, stream>>>(
+          clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get());
 
-      // last element holds the number of all clusters
-      cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]),
-                                clusters_d.clusModuleStart() + gpuClustering::maxNumModules,
-                                sizeof(uint32_t),
-                                cudaMemcpyDefault,
-                                stream));
+      // copy to host
+      nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(3, stream);
+      cudaCheck(cudaMemcpyAsync(
+          nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));
 
 #ifdef GPU_DEBUG
       cudaDeviceSynchronize();

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -114,7 +114,8 @@ namespace pixelgpudetails {
 
     std::pair<SiPixelDigisCUDA, SiPixelClustersCUDA> getResults() {
       digis_d.setNModulesDigis(nModules_Clusters_h[0], nDigis);
-      clusters_d.setNClusters(nModules_Clusters_h[1]);
+      assert(nModules_Clusters_h[2] <= nModules_Clusters_h[1]);
+      clusters_d.setNClusters(nModules_Clusters_h[1], nModules_Clusters_h[2]);
       // need to explicitly deallocate while the associated CUDA
       // stream is still alive
       //

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu
@@ -38,7 +38,7 @@ namespace pixelgpudetails {
                                                           pixelCPEforGPU::ParamsOnGPU const* cpeParams,
                                                           cudaStream_t stream) const {
     auto nHits = clusters_d.nClusters();
-    TrackingRecHit2DGPU hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), stream);
+    TrackingRecHit2DGPU hits_d(nHits, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream);
 
     int threadsPerBlock = 128;
     int blocks = digis_d.nModules();  // active modules (with digis)

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc
@@ -141,7 +141,9 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv
   assert(numberOfClusters == int(hitsModuleStart[gpuClustering::maxNumModules]));
 
   // output SoA
-  auto output = std::make_unique<TrackingRecHit2DCPU>(numberOfClusters, &cpeView, hitsModuleStart, nullptr);
+  // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1)
+  auto output =
+      std::make_unique<TrackingRecHit2DCPU>(numberOfClusters, hitsModuleStart[96], &cpeView, hitsModuleStart, nullptr);
 
   if (0 == numberOfClusters) {
     iEvent.put(std::move(output));

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h
@@ -73,11 +73,18 @@ namespace caConstants {
   using CellNeighborsVector = cms::cuda::SimpleVector<CellNeighbors>;
   using CellTracksVector = cms::cuda::SimpleVector<CellTracks>;
 
-  using OuterHitOfCell = cms::cuda::VecArray<uint32_t, maxCellsPerHit>;
+  using OuterHitOfCellContainer = cms::cuda::VecArray<uint32_t, maxCellsPerHit>;
   using TuplesContainer = cms::cuda::OneToManyAssoc<hindex_type, maxTuples, 5 * maxTuples>;
   using HitToTuple = cms::cuda::OneToManyAssoc<tindex_type, -1, 4 * maxTuples>;  // 3.5 should be enough
   using TupleMultiplicity = cms::cuda::OneToManyAssoc<tindex_type, 8, maxTuples>;
 
+  struct OuterHitOfCell {
+    OuterHitOfCellContainer* container;
+    int32_t offset;
+    constexpr auto& operator[](int i) { return container[i - offset]; }
+    constexpr auto const& operator[](int i) const { return container[i - offset]; }
+  };
+
 }  // namespace caConstants
 
 #endif  // RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc
@@ -15,15 +15,16 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
   auto nhits = hh.nHits();
 
 #ifdef NTUPLE_DEBUG
-  std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
+  std::cout << "building Doublets out of " << nhits << " Hits. BPIX2 offset is " << hh.offsetBPIX2() << std::endl;
 #endif
 
   // use "nhits" to heuristically dimension the workspace
 
   // no need to use the Traits allocations, since we know this is being compiled for the CPU
   //device_isOuterHitOfCell_ = Traits::template make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
-  device_isOuterHitOfCell_ = std::make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits));
+  device_isOuterHitOfCell_ = std::make_unique<GPUCACell::OuterHitOfCellContainer[]>(std::max(1U, nhits));
   assert(device_isOuterHitOfCell_.get());
+  isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};
 
   auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
                          caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks);
@@ -34,7 +35,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
   device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets *
                                                                                       sizeof(GPUCACell::CellNeighbors));
 
-  gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(),
+  gpuPixelDoublets::initDoublets(isOuterHitOfCell_,
                                  nhits,
                                  device_theCellNeighbors_.get(),
                                  device_theCellNeighborsContainer_,
@@ -64,7 +65,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
                                          device_theCellNeighbors_.get(),
                                          device_theCellTracks_.get(),
                                          hh.view(),
-                                         device_isOuterHitOfCell_.get(),
+                                         isOuterHitOfCell_,
                                          nActualPairs,
                                          params_.idealConditions_,
                                          params_.doClusterCut_,
@@ -98,7 +99,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
                  device_theCells_.get(),
                  device_nCells_,
                  device_theCellNeighbors_.get(),
-                 device_isOuterHitOfCell_.get(),
+                 isOuterHitOfCell_,
                  params_.hardCurvCut_,
                  params_.ptmin_,
                  params_.CAThetaCutBarrel_,
@@ -107,8 +108,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
                  params_.dcaCutOuterTriplet_);
 
   if (nhits > 1 && params_.earlyFishbone_) {
-    gpuPixelDoublets::fishbone(
-        hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
+    gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false);
   }
 
   kernel_find_ntuplets(hh.view(),
@@ -132,8 +132,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());
 
   if (nhits > 1 && params_.lateFishbone_) {
-    gpuPixelDoublets::fishbone(
-        hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
+    gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
   }
 
   if (params_.doStats_) {
@@ -145,7 +144,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
                           device_nCells_,
                           device_theCellNeighbors_.get(),
                           device_theCellTracks_.get(),
-                          device_isOuterHitOfCell_.get(),
+                          isOuterHitOfCell_,
                           nhits,
                           params_.maxNumberOfDoublets_,
                           counters_);

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -54,7 +54,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
       device_theCells_.get(),
       device_nCells_,
       device_theCellNeighbors_.get(),
-      device_isOuterHitOfCell_.get(),
+      isOuterHitOfCell_,
       params_.hardCurvCut_,
       params_.ptmin_,
       params_.CAThetaCutBarrel_,
@@ -67,11 +67,11 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
     auto nthTot = 128;
     auto stride = 16;
     auto blockSize = nthTot / stride;
-    auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
+    auto numberOfBlocks = (nhits - isOuterHitOfCell_.offset + blockSize - 1) / blockSize;
     dim3 blks(1, numberOfBlocks, 1);
     dim3 thrs(stride, blockSize, 1);
     gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
-        hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
+        hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false);
     cudaCheck(cudaGetLastError());
   }
 
@@ -119,11 +119,11 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
     auto nthTot = 128;
     auto stride = 16;
     auto blockSize = nthTot / stride;
-    auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
+    auto numberOfBlocks = (nhits - isOuterHitOfCell_.offset + blockSize - 1) / blockSize;
     dim3 blks(1, numberOfBlocks, 1);
     dim3 thrs(stride, blockSize, 1);
     gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
-        hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
+        hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
     cudaCheck(cudaGetLastError());
   }
 
@@ -140,6 +140,8 @@ template <>
 void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
   int32_t nhits = hh.nHits();
 
+  isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};
+
 #ifdef NTUPLE_DEBUG
   std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
 #endif
@@ -150,9 +152,12 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
 #endif
 
   // in principle we can use "nhits" to heuristically dimension the workspace...
-  device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1, nhits), stream);
+  device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCellContainer[]>(
+      std::max(1, nhits - hh.offsetBPIX2()), stream);
   assert(device_isOuterHitOfCell_.get());
 
+  isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};
+
   cellStorage_ = cms::cuda::make_device_unique<unsigned char[]>(
       caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
           caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks),
@@ -164,8 +169,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
   {
     int threadsPerBlock = 128;
     // at least one block!
-    int blocks = (std::max(1, nhits) + threadsPerBlock - 1) / threadsPerBlock;
-    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(device_isOuterHitOfCell_.get(),
+    int blocks = (std::max(1, nhits - hh.offsetBPIX2()) + threadsPerBlock - 1) / threadsPerBlock;
+    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(isOuterHitOfCell_,
                                                                            nhits,
                                                                            device_theCellNeighbors_.get(),
                                                                            device_theCellNeighborsContainer_,
@@ -206,7 +211,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
                                                                     device_theCellNeighbors_.get(),
                                                                     device_theCellTracks_.get(),
                                                                     hh.view(),
-                                                                    device_isOuterHitOfCell_.get(),
+                                                                    isOuterHitOfCell_,
                                                                     nActualPairs,
                                                                     params_.idealConditions_,
                                                                     params_.doClusterCut_,
@@ -324,7 +329,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
                                                                         device_nCells_,
                                                                         device_theCellNeighbors_.get(),
                                                                         device_theCellTracks_.get(),
-                                                                        device_isOuterHitOfCell_.get(),
+                                                                        isOuterHitOfCell_,
                                                                         nhits,
                                                                         params_.maxNumberOfDoublets_,
                                                                         counters_);

diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
@@ -205,7 +205,8 @@ class CAHitNtupletGeneratorKernels {
   caConstants::CellTracks* device_theCellTracksContainer_;
 
   unique_ptr<GPUCACell[]> device_theCells_;
-  unique_ptr<GPUCACell::OuterHitOfCell[]> device_isOuterHitOfCell_;
+  unique_ptr<GPUCACell::OuterHitOfCellContainer[]> device_isOuterHitOfCell_;
+  GPUCACell::OuterHitOfCell isOuterHitOfCell_;
   uint32_t* device_nCells_ = nullptr;
 
   unique_ptr<HitToTuple> device_hitToTuple_;