Merge pull request #37444 from missirol/backport_37281_to_123X
Backport to `12_3_X` of Patatrack pixel-reco updates (pT-clamping in vertex sorting + stabilized Fishbone)
cmsbuild committed Apr 5, 2022
2 parents 6e4ac3f + 3c15c2f commit 1ed9513
Showing 13 changed files with 114 additions and 60 deletions.
6 changes: 6 additions & 0 deletions CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
@@ -45,6 +45,9 @@ class TrackSoAHeterogeneousT {

eigenSoA::ScalarSoA<int8_t, S> nLayers;

constexpr int nTracks() const { return nTracks_; }
constexpr void setNTracks(int n) { nTracks_ = n; }

constexpr int nHits(int i) const { return detIndices.size(i); }

constexpr bool isTriplet(int i) const { return nLayers(i) == 3; }
@@ -80,6 +83,9 @@ class TrackSoAHeterogeneousT {

HitContainer hitIndices;
HitContainer detIndices;

private:
int nTracks_;
};

namespace pixelTrack {
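The main change in this file is that the track SoA now carries an explicit track count next to its fixed-capacity arrays, so consumers no longer have to scan to the first nHits == 0 sentinel. A minimal standalone sketch of the pattern (a hypothetical mock, not the actual CMSSW TrackSoAHeterogeneousT):

#include <array>
#include <cassert>

// Hypothetical mock of the SoA pattern above: storage is sized for the worst case
// (stride), and the producer records how many entries were actually filled.
template <int S>
struct MockTrackSoA {
  std::array<int, S> nHitsPerTrack{};  // 0 marks an unused slot (the old sentinel)
  static constexpr int stride() { return S; }

  constexpr int nTracks() const { return nTracks_; }
  constexpr void setNTracks(int n) { nTracks_ = n; }

private:
  int nTracks_ = 0;
};

int main() {
  MockTrackSoA<8> soa;
  soa.nHitsPerTrack = {{4, 3, 5, 0, 0, 0, 0, 0}};
  soa.setNTracks(3);

  // Old style: scan the full stride until the nHits == 0 sentinel.
  int counted = 0;
  for (int i = 0; i < soa.stride(); ++i) {
    if (soa.nHitsPerTrack[i] == 0)
      break;
    ++counted;
  }

  // New style: trust the explicit count; the scan above is only a cross-check here.
  assert(counted == soa.nTracks());
  return 0;
}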
@@ -154,16 +154,15 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
auto const *quality = tsoa.qualityData();
auto const &fit = tsoa.stateAtBS;
auto const &hitIndices = tsoa.hitIndices;
auto maxTracks = tsoa.stride();
auto nTracks = tsoa.nTracks();

tracks.reserve(maxTracks);
tracks.reserve(nTracks);

int32_t nt = 0;

for (int32_t it = 0; it < maxTracks; ++it) {
for (int32_t it = 0; it < nTracks; ++it) {
auto nHits = tsoa.nHits(it);
if (nHits == 0)
break; // this is a guard: maybe we need to move to nTracks...
assert(nHits >= 3);
indToEdm.push_back(-1);
auto q = quality[it];
if (q < minQuality_)
@@ -17,7 +17,7 @@
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"

// Switch on to enable checks and printout for found tracks
#undef PIXEL_DEBUG_PRODUCE
// #define PIXEL_DEBUG_PRODUCE

class PixelTrackSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
public:
@@ -63,7 +63,9 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
#ifdef PIXEL_DEBUG_PRODUCE
auto const& tsoa = *soa_;
auto maxTracks = tsoa.stride();
std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks << std::endl;
auto nTracks = tsoa.nTracks();
std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
std::cout << "found " << nTracks << " tracks in cpu SoA at " << &tsoa << std::endl;

int32_t nt = 0;
for (int32_t it = 0; it < maxTracks; ++it) {
@@ -73,7 +75,7 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
break; // this is a guard: maybe we need to move to nTracks...
nt++;
}
std::cout << "found " << nt << " tracks in cpu SoA at " << &tsoa << std::endl;
assert(nTracks == nt);
#endif

// DO NOT make a copy (actually TWO....)
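The debug switch here changes from an explicit #undef to a commented-out #define, so enabling the consistency checks is a one-line edit; the checks themselves now verify that the scanned track count matches tsoa.nTracks(). A generic sketch of this compile-time guard pattern (names are illustrative, not from the PR):

#include <cstdio>

// Uncomment to enable verbose checks in this translation unit (illustrative pattern).
// #define MY_PIXEL_DEBUG

void reportTracks(int nTracks) {
#ifdef MY_PIXEL_DEBUG
  // Compiled in only when the macro is defined; no runtime cost otherwise.
  std::printf("found %d tracks\n", nTracks);
#else
  (void)nTracks;  // silence unused-parameter warnings when the guard is off
#endif
}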
@@ -129,7 +129,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d);

kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d);
kernel_fillNLayers(tracks_d);
kernel_fillNLayers(tracks_d, device_hitTuple_apc_);

// remove duplicates (tracks that share a doublet)
kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_);
@@ -213,7 +213,11 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA

#ifdef DUMP_GPU_TK_TUPLES
static std::atomic<int> iev(0);
++iev;
kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev);
static std::mutex lock;
{
std::lock_guard<std::mutex> guard(lock);
++iev;
kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 0, 1000000, iev);
}
#endif
}
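Both the CPU and GPU versions of the DUMP_GPU_TK_TUPLES block now serialize the dump: the event-counter increment and the printout run under a static mutex, so concurrent streams cannot interleave their output or tear the pairing between iev and the printed tuples. A standalone sketch of the pattern (illustrative, not the actual kernel call):

#include <atomic>
#include <cstdio>
#include <mutex>

// A dump routine that may be called from several threads at once, but whose counter
// update and printout must appear as one uninterrupted block per call.
void dumpTuples(int nTuples) {
  static std::atomic<int> iev(0);
  static std::mutex lock;
  {
    std::lock_guard<std::mutex> guard(lock);
    ++iev;
    std::printf("event %d: dumping %d tuples\n", iev.load(), nTuples);
  }
}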
@@ -1,4 +1,5 @@
#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h"
#include <mutex>

template <>
void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
@@ -90,7 +91,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *

kernel_fillHitDetIndices<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, hh.view(), detId_d);
cudaCheck(cudaGetLastError());
kernel_fillNLayers<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d);
kernel_fillNLayers<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d, device_hitTuple_apc_);
cudaCheck(cudaGetLastError());

// remove duplicates (tracks that share a doublet)
@@ -330,9 +331,20 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA

#ifdef DUMP_GPU_TK_TUPLES
static std::atomic<int> iev(0);
++iev;
kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev);
static std::mutex lock;
{
std::lock_guard<std::mutex> guard(lock);
++iev;
for (int k = 0; k < 20000; k += 500) {
kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), k, k + 500, iev);
cudaDeviceSynchronize();
}
kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 20000, 1000000, iev);
cudaDeviceSynchronize();
// cudaStreamSynchronize(cudaStream);
}
#endif
}

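On the GPU the dump is additionally split into fixed-size index windows, with a device synchronization after each launch so the printf buffer is flushed before the next window starts. A simplified CUDA sketch of the idea (a toy kernel, not the actual kernel_print_found_ntuplets):

#include <algorithm>
#include <cstdio>

// Toy kernel: each of the 32 threads walks the window [first, last) with a stride of blockDim.x.
__global__ void printRange(int first, int last) {
  for (int i = first + threadIdx.x; i < last; i += blockDim.x)
    printf("item %d\n", i);
}

// Host-side driver: print nItems entries in windows of 500, flushing after each window.
void dumpInChunks(int nItems, cudaStream_t stream) {
  constexpr int chunk = 500;
  for (int k = 0; k < nItems; k += chunk) {
    printRange<<<1, 32, 0, stream>>>(k, std::min(k + chunk, nItems));
    cudaDeviceSynchronize();  // wait so this window's output appears before the next launch
  }
}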
@@ -21,6 +21,7 @@ namespace cAHitNtupletGenerator {
unsigned long long nGoodTracks;
unsigned long long nUsedHits;
unsigned long long nDupHits;
unsigned long long nFishCells;
unsigned long long nKilledCells;
unsigned long long nEmptyCells;
unsigned long long nZeroTrackCells;
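The Counters struct gains an nFishCells field; as the kernel_checkOverflows hunk below shows, it is filled on the device with one atomic increment per cell that still carries a fishbone and was not killed. A minimal standalone sketch of that counting pattern (mock types, not the CMSSW ones):

// Mock event-level counters (the real struct is cAHitNtupletGenerator::Counters).
struct MockCounters {
  unsigned long long nFishCells;
};

// Grid-stride loop over cells: every thread that sees a live fishbone bumps the shared
// counter atomically, so the total is correct regardless of thread ordering.
__global__ void countFishCells(bool const* hasFishbone, bool const* isKilled, int nCells, MockCounters* c) {
  int first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int idx = first; idx < nCells; idx += gridDim.x * blockDim.x) {
    if (hasFishbone[idx] && !isKilled[idx])
      atomicAdd(&c->nFishCells, 1ull);
  }
}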
@@ -101,6 +101,8 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,

for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
auto const &thisCell = cells[idx];
if (thisCell.hasFishbone() && !thisCell.isKilled())
atomicAdd(&c.nFishCells, 1);
if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId];
printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId());
if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId];
@@ -194,7 +196,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,

/* chi2 penalize higher-pt tracks (try rescale it?)
auto score = [&](auto it) {
return tracks->nHits(it) < 4 ?
return tracks->nLayers(it) < 4 ?
std::abs(tracks->tip(it)) : // tip for triplets
tracks->chi2(it); //chi2 for quads
};
@@ -204,7 +206,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,

// full crazy combinatorics
int ntr = thisCell.tracks().size();
for (int i = 0; i < ntr; ++i) {
for (int i = 0; i < ntr - 1; ++i) {
auto it = thisCell.tracks()[i];
auto qi = tracks->quality(it);
if (qi <= reject)
@@ -557,13 +559,15 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples
}
}

__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks) {
__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) {
auto &tracks = *ptracks;
auto first = blockIdx.x * blockDim.x + threadIdx.x;
for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) {
auto ntracks = apc->get().m;
if (0 == first)
tracks.setNTracks(ntracks);
for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
auto nHits = tracks.nHits(idx);
if (nHits == 0)
break; // this is a guard: maybe we need to move to nTracks...
assert(nHits >= 3);
tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx);
}
}
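kernel_fillNLayers now receives the AtomicPairCounter used to bulk-fill the tuples, reads the number of stored tracks from it, has thread 0 record that count in the SoA, and bounds its loop by it instead of scanning up to the stride. A simplified mock of the pattern (stand-in types reproducing only the members used above; the real counter is cms::cuda::AtomicPairCounter):

// Stand-in types, illustrative only.
struct MockPairCounter {
  struct Counters { unsigned int n, m; };
  Counters counters;
  __device__ Counters get() const { return counters; }
};

struct MockTrackSoA {
  int nTracks_;
  __device__ void setNTracks(int n) { nTracks_ = n; }
};

__global__ void fillNLayersSketch(MockTrackSoA* tracks, MockPairCounter const* apc) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  auto ntracks = apc->get().m;    // number of tuples actually stored by the bulk fill
  if (0 == first)
    tracks->setNTracks(ntracks);  // record the count once in the SoA
  for (int idx = first; idx < int(ntracks); idx += gridDim.x * blockDim.x) {
    // per-track work (computing the number of layers in the real kernel) goes here
  }
}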
@@ -666,7 +670,7 @@ __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks,
auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); };

// full combinatorics
for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) {
auto const it = *ip;
auto qi = quality[it];
if (qi <= reject)
@@ -676,7 +680,7 @@ __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks,
auto cti = tracks.stateAtBS.state(it)(3);
auto e2cti = tracks.stateAtBS.covariance(it)(12);
auto nli = tracks.nLayers(it);
for (auto jp = ip + 1; jp != hitToTuple.end(idx); ++jp) {
for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
auto const jt = *jp;
auto qj = quality[jt];
if (qj <= reject)
@@ -860,19 +864,25 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res
TkSoA const *__restrict__ ptracks,
Quality const *__restrict__ quality,
CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
int32_t maxPrint,
int32_t firstPrint,
int32_t lastPrint,
int iev) {
constexpr auto loose = pixelTrack::Quality::loose;
auto const &hh = *hhp;
auto const &foundNtuplets = *ptuples;
auto const &tracks = *ptracks;
int first = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = first, np = std::min(maxPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x;
for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
auto nh = foundNtuplets.size(i);
if (nh < 3)
continue;
printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n",
if (quality[i] < loose)
continue;
printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
10000 * iev + i,
int(quality[i]),
nh,
tracks.nLayers(i),
tracks.charge(i),
tracks.pt(i),
tracks.eta(i),
@@ -881,11 +891,13 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res
tracks.zip(i),
// asinhf(fit_results[i].par(3)),
tracks.chi2(i),
*foundNtuplets.begin(i),
*(foundNtuplets.begin(i) + 1),
*(foundNtuplets.begin(i) + 2),
nh > 3 ? int(*(foundNtuplets.begin(i) + 3)) : -1,
nh > 4 ? int(*(foundNtuplets.begin(i) + 4)) : -1);
hh.zGlobal(*foundNtuplets.begin(i)),
hh.zGlobal(*(foundNtuplets.begin(i) + 1)),
hh.zGlobal(*(foundNtuplets.begin(i) + 2)),
nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0,
nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0,
nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0,
nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0);
}
}

@@ -894,22 +906,24 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun
printf(
"||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nLooseTracks | nGoodTracks | nUsedHits | "
"nDupHits | "
"nFishCells | "
"nKilledCells | "
"nUsedCells | nZeroTrackCells ||\n");
printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
c.nEvents,
c.nHits,
c.nCells,
c.nTuples,
c.nFitTracks,
c.nLooseTracks,
c.nGoodTracks,
c.nFitTracks,
c.nUsedHits,
c.nDupHits,
c.nFishCells,
c.nKilledCells,
c.nEmptyCells,
c.nZeroTrackCells);
printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f||\n",
printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f| %.3f||\n",
c.nEvents,
c.nHits / double(c.nEvents),
c.nCells / double(c.nEvents),
Expand All @@ -919,6 +933,7 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun
c.nGoodTracks / double(c.nEvents),
c.nUsedHits / double(c.nEvents),
c.nDupHits / double(c.nEvents),
c.nFishCells / double(c.nCells),
c.nKilledCells / double(c.nCells),
c.nEmptyCells / double(c.nCells),
c.nZeroTrackCells / double(c.nCells));
@@ -83,10 +83,11 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet&
cfg.getParameter<double>("dcaCutOuterTriplet"),
makeQualityCuts(cfg.getParameterSet("trackQualityCuts"))) {
#ifdef DUMP_GPU_TK_TUPLES
printf("TK: %s %s % %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
printf("TK: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
"tid",
"qual",
"nh",
"nl",
"charge",
"pt",
"eta",
@@ -98,7 +99,8 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet&
"h2",
"h3",
"h4",
"h5");
"h5",
"hn");
#endif

if (m_params.onGPU_) {
18 changes: 13 additions & 5 deletions RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -22,6 +22,7 @@ class GPUCACell {
using PtrAsInt = unsigned long long;

static constexpr auto maxCellsPerHit = caConstants::maxCellsPerHit;
static constexpr auto invalidHitId = std::numeric_limits<caConstants::hindex_type>::max();
using OuterHitOfCellContainer = caConstants::OuterHitOfCellContainer;
using OuterHitOfCell = caConstants::OuterHitOfCell;
using CellNeighbors = caConstants::CellNeighbors;
@@ -52,7 +53,7 @@ class GPUCACell {
theOuterHitId = outerHitId;
theLayerPairId_ = layerPairId;
theStatus_ = 0;
theFishboneId = std::numeric_limits<hindex_type>::max();
theFishboneId = invalidHitId;

// optimization that depends on access pattern
theInnerZ = hh.zGlobal(innerHitId);
@@ -342,11 +343,18 @@ class GPUCACell {
__device__ __forceinline__ bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); }
__device__ __forceinline__ void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); }

__device__ __forceinline__ void setFishbone(hindex_type id) { theFishboneId = id; }
__device__ __forceinline__ auto fishboneId() const { return theFishboneId; }
__device__ __forceinline__ bool hasFishbone() const {
return theFishboneId != std::numeric_limits<hindex_type>::max();
__device__ __forceinline__ void setFishbone(hindex_type id, float z, Hits const& hh) {
// make it deterministic: use the farther apart (in z)
auto old = theFishboneId;
while (
old !=
atomicCAS(&theFishboneId,
old,
(invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh.zGlobal(old) - theInnerZ)) ? id : old))
old = theFishboneId;
}
__device__ __forceinline__ auto fishboneId() const { return theFishboneId; }
__device__ __forceinline__ bool hasFishbone() const { return theFishboneId != invalidHitId; }

private:
CellNeighbors* theOuterNeighbors;
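setFishbone is now deterministic under concurrent updates: instead of a plain store it runs a compare-and-swap loop that keeps whichever candidate hit lies farther in z from the cell's inner hit, so the stored id no longer depends on thread scheduling. A standalone device-side sketch of this lock-free "keep the best candidate" pattern (a generic helper, not the CMSSW member function):

#include <cmath>

// slot holds a hit index or the invalid marker; zOf maps a hit index to its z coordinate;
// refZ is the reference (inner-hit) z. Several threads may call this concurrently with
// different candidates; the one farthest in z from refZ wins.
__device__ void keepFarthestInZ(unsigned int* slot, unsigned int id, float z, float refZ, float const* zOf) {
  constexpr unsigned int invalid = 0xffffffffu;
  unsigned int old = *slot;
  while (true) {
    bool better = (old == invalid) || (fabsf(z - refZ) > fabsf(zOf[old] - refZ));
    unsigned int desired = better ? id : old;
    unsigned int prev = atomicCAS(slot, old, desired);
    if (prev == old)
      break;     // our snapshot was current: either id was stored or the better old value kept
    old = prev;  // another thread updated the slot; re-evaluate against its value
  }
}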
10 changes: 5 additions & 5 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
@@ -75,25 +75,25 @@ namespace gpuPixelDoublets {
// must be different detectors
// if (d[ic]==d[jc]) continue;
auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc];
if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) {
if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * (n[ic] * n[jc])) {
// alligned: kill farthest (prefer consecutive layers)
// if same layer prefer farthest (longer level arm) and make space for intermediate hit
bool sameLayer = l[ic] == l[jc];
if (n[ic] > n[jc]) {
if (sameLayer) {
cj.kill(); // closest
ci.setFishbone(cj.inner_hit_id());
ci.setFishbone(cj.inner_hit_id(), cj.inner_z(hh), hh);
} else {
ci.kill(); // farthest
break;
// break; // removed to improve reproducibility. keep it for reference and tests
}
} else {
if (!sameLayer) {
cj.kill(); // farthest
} else {
ci.kill(); // closest
cj.setFishbone(ci.inner_hit_id());
break;
cj.setFishbone(ci.inner_hit_id(), ci.inner_z(hh), hh);
// break; // removed to improve reproducibility. keep it for reference and tests
}
}
}
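Two things change in the fishbone loop: the break statements are removed so every aligned pair is inspected (trading a little extra work for run-to-run reproducibility, as the diff comments note), and the alignment cut gains parentheses, presumably to pin down the floating-point evaluation order. The cut itself compares the squared cosine of the angle between the two doublets sharing the inner hit against a threshold, with no square root needed. A small host-side helper illustrating that test (hypothetical, not part of the PR):

// Doublets a and b (direction vectors from the shared inner hit) are "aligned" when
//   cos^2(theta) >= cut2,  tested as  (a.b)^2 >= cut2 * (|a|^2 * |b|^2).
bool aligned(float ax, float ay, float az, float bx, float by, float bz, float cut2 = 0.99999f) {
  float dot = ax * bx + ay * by + az * bz;  // a . b
  float n2a = ax * ax + ay * ay + az * az;  // |a|^2
  float n2b = bx * bx + by * by + bz * bz;  // |b|^2
  return dot * dot >= cut2 * (n2a * n2b);
}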
