Improve Various Patatrack kernels #35835

Merged: 14 commits, Oct 27, 2021
@@ -38,8 +38,7 @@ void SiPixelGainCalibrationForHLTService::calibrate(
     if (isDeadColumn | isNoisyColumn)
       electron[i++] = 0;
     else {
-      float vcal = di->adc() * gain - pedestal * gain;
-      // float vcal = (di->adc() - DBpedestal) * DBgain;
+      float vcal = float(di->adc()) * gain - pedestal * gain;
       electron[i++] = int(vcal * conversionFactor + offset);
     }
   }
@@ -28,8 +28,7 @@ void SiPixelGainCalibrationServiceBase::calibrate(
     else {
       float DBgain = getGain(detID, col, row);
       float DBpedestal = getPedestal(detID, col, row) * DBgain;
-      float vcal = di->adc() * DBgain - DBpedestal;
-      // float vcal = (di->adc() - DBpedestal) * DBgain;
+      float vcal = float(di->adc()) * DBgain - DBpedestal;
       electron[i++] = int(vcal * conversionFactor + offset);
     }
   }
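Aside: the net effect of the two hunks above is an explicit integer-to-float conversion before the arithmetic (the stale commented-out variant is dropped). A minimal host-side sketch of the calibration formula, with made-up numbers (gain, pedestal and conversionFactor here are illustrative, not the conditions-database values; offset mirrors the -60 used for L2-4):

#include <cstdint>
#include <cstdio>

int main() {
  uint16_t adc = 137;                             // raw ADC count (made-up)
  float gain = 0.5f, pedestal = 20.f;             // per-pixel calibration (made-up)
  float conversionFactor = 65.f, offset = -60.f;  // VCal -> electrons (conversionFactor made-up)
  // explicit float() keeps the conversion visible and the math in single precision
  float vcal = float(adc) * gain - pedestal * gain;
  int electrons = int(vcal * conversionFactor + offset);
  std::printf("vcal = %.1f -> %d electrons\n", vcal, electrons);
  return 0;
}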
@@ -107,8 +107,8 @@ class SiPixelGainCalibrationForHLT {
 private:
   float encodeGain(const float& gain);
   float encodePed(const float& ped);
-  float decodeGain(unsigned int gain) const { return gain * gainPrecision + minGain_; }
-  float decodePed(unsigned int ped) const { return ped * pedPrecision + minPed_; }
+  float decodeGain(unsigned int gain) const { return float(gain) * gainPrecision + minGain_; }
+  float decodePed(unsigned int ped) const { return float(ped) * pedPrecision + minPed_; }

   std::vector<char> v_pedestals;  //@@@ blob streaming doesn't work with uint16_t and with classes
   std::vector<DetRegistry> indexes;
4 changes: 2 additions & 2 deletions CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h
@@ -56,8 +56,8 @@ class SiPixelGainForHLTonGPU {
     return std::make_pair(decodePed(s.ped & 0xFF), decodeGain(s.gain & 0xFF));
   }

-  constexpr float decodeGain(unsigned int gain) const { return gain * gainPrecision_ + minGain_; }
-  constexpr float decodePed(unsigned int ped) const { return ped * pedPrecision_ + minPed_; }
+  constexpr float decodeGain(unsigned int gain) const { return float(gain) * gainPrecision_ + minGain_; }
+  constexpr float decodePed(unsigned int ped) const { return float(ped) * pedPrecision_ + minPed_; }

   DecodingStructure* v_pedestals_;
   std::pair<Range, int> rangeAndCols_[gpuClustering::maxNumModules];
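Aside: decodeGain/decodePed undo a linear 8-bit packing, value = code * precision + min. A standalone sketch under assumed ranges (minPed, maxPed and the 255-step quantization are illustrative; the real precisions come from the calibration payload):

#include <cassert>

constexpr float minPed = 0.f, maxPed = 50.f;  // assumed range
constexpr float pedPrecision = (maxPed - minPed) / 255.f;

constexpr float decodePed(unsigned int ped) { return float(ped) * pedPrecision + minPed; }

int main() {
  assert(decodePed(0) == minPed);
  assert(decodePed(255) > 49.9f && decodePed(255) < 50.1f);  // top code maps back to ~maxPed
  return 0;
}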
@@ -582,16 +582,27 @@ namespace pixelgpudetails {
     int blocks =
         (std::max(int(wordCounter), int(gpuClustering::maxNumModules)) + threadsPerBlock - 1) / threadsPerBlock;

-    gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, stream>>>(isRun2,
-                                                                      digis_d.moduleInd(),
-                                                                      digis_d.xx(),
-                                                                      digis_d.yy(),
-                                                                      digis_d.adc(),
-                                                                      gains,
-                                                                      wordCounter,
-                                                                      clusters_d.moduleStart(),
-                                                                      clusters_d.clusInModule(),
-                                                                      clusters_d.clusModuleStart());
+    if (isRun2)
+      gpuCalibPixel::calibDigis<true><<<blocks, threadsPerBlock, 0, stream>>>(digis_d.moduleInd(),
+                                                                              digis_d.xx(),
+                                                                              digis_d.yy(),
+                                                                              digis_d.adc(),
+                                                                              gains,
+                                                                              wordCounter,
+                                                                              clusters_d.moduleStart(),
+                                                                              clusters_d.clusInModule(),
+                                                                              clusters_d.clusModuleStart());
+    else
+      gpuCalibPixel::calibDigis<false><<<blocks, threadsPerBlock, 0, stream>>>(digis_d.moduleInd(),
+                                                                               digis_d.xx(),
+                                                                               digis_d.yy(),
+                                                                               digis_d.adc(),
+                                                                               gains,
+                                                                               wordCounter,
+                                                                               clusters_d.moduleStart(),
+                                                                               clusters_d.clusInModule(),
+                                                                               clusters_d.clusModuleStart());

     cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
     cudaDeviceSynchronize();
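Aside: the rewrite above trades the runtime bool argument for a compile-time template parameter, so each kernel instantiation carries only the code path it needs. A minimal sketch of the same dispatch pattern (kernel and names are hypothetical):

template <bool doConvert>
__global__ void kernelSketch(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n)
    return;
  if constexpr (doConvert)
    data[i] = data[i] * 65.f - 60.f;  // this branch exists only in the <true> instantiation
}

void launchSketch(bool doConvert, float* data, int n, cudaStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  // the runtime flag just selects one of two pre-compiled instantiations
  if (doConvert)
    kernelSketch<true><<<blocks, threads, 0, stream>>>(data, n);
  else
    kernelSketch<false><<<blocks, threads, 0, stream>>>(data, n);
}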
@@ -607,7 +618,7 @@ namespace pixelgpudetails {
         digis_d.moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter);
     cudaCheck(cudaGetLastError());

-    threadsPerBlock = 256;
+    threadsPerBlock = 256 + 128;  /// should be larger than 6000/16 aka (maxPixInModule/maxiter in the kernel)
     blocks = maxNumModules;
 #ifdef GPU_DEBUG
     std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
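Aside: the new comment encodes a simple constraint: findClus covers a module in at most 16 iterations, so with up to 6000 pixels per module a block needs more than 6000 / 16 = 375 threads, and 256 + 128 = 384 is the next multiple of 128 above that. The same arithmetic as a compile-time check (constant names are assumptions taken from the comment):

constexpr int maxPixInModule = 6000;  // assumed, from the comment above
constexpr int maxiter = 16;
constexpr int threadsPerBlock = 256 + 128;
static_assert(threadsPerBlock > maxPixInModule / maxiter,
              "a block must span a full module within the 16-iteration budget");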
16 changes: 9 additions & 7 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h
@@ -18,8 +18,8 @@ namespace gpuCalibPixel {
   constexpr float VCaltoElectronOffset = -60;      // L2-4:  -60 +- 130
   constexpr float VCaltoElectronOffset_L1 = -670;  // L1:   -670 +- 220

-  __global__ void calibDigis(bool isRun2,
-                             uint16_t* id,
+  template <bool isRun2>
+  __global__ void calibDigis(uint16_t* id,
                              uint16_t const* __restrict__ x,
                              uint16_t const* __restrict__ y,
                              uint16_t* adc,
@@ -42,9 +42,6 @@ namespace gpuCalibPixel {
       if (invalidModuleId == id[i])
         continue;

-      float conversionFactor = (isRun2) ? (id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain) : 1.f;
-      float offset = (isRun2) ? (id[i] < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset) : 0;
-
       bool isDeadColumn = false, isNoisyColumn = false;

       int row = x[i];
@@ -58,8 +55,13 @@ namespace gpuCalibPixel {
         id[i] = invalidModuleId;
         adc[i] = 0;
       } else {
-        float vcal = adc[i] * gain - pedestal * gain;
-        adc[i] = std::max(100, int(vcal * conversionFactor + offset));
+        float vcal = float(adc[i]) * gain - pedestal * gain;
+        if constexpr (isRun2) {
+          float conversionFactor = id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain;
+          float offset = id[i] < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset;
+          vcal = vcal * conversionFactor + offset;
+        }
+        adc[i] = std::max(100, int(vcal));
       }
     }
   }

Review thread on the id[i] < 96 comparison:

Contributor: 96 -> phase1PixelTopology::layerStart[1] ?

Author: There is an issue open to fix this everywhere (phase1PixelTopology::layerStart[1] may not compile, or may compile and produce wrong results).

Contributor: Why would it not compile and/or produce wrong results? Can you point me to the issue? (I might have missed it.)

Contributor: For the record (thanks @VinInn): #35370

Review comment on the std::max(100, int(vcal)) line:

Contributor: @cms-sw/trk-dpg-l2 it would be nice if this 100 were taken from the same place as the other famous 100 in the regular clusterizer.
29 changes: 13 additions & 16 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
@@ -87,43 +87,40 @@ namespace gpuClustering {

auto chargeCut =
clusterThresholds.getThresholdForLayerOnCondition(thisModuleId < phase1PixelTopology::layerStart[1]);

bool good = true;
for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
newclusId[i] = ok[i] = charge[i] >= chargeCut ? 1 : 0;
if (0 == ok[i])
good = false;
}

__syncthreads();
// if all clusters above threshold do nothing
if (__syncthreads_and(good))
continue;

// renumber
__shared__ uint16_t ws[32];
cms::cuda::blockPrefixScan(newclusId, nclus, ws);

assert(nclus >= newclusId[nclus - 1]);

if (nclus == newclusId[nclus - 1])
continue;
assert(nclus > newclusId[nclus - 1]);

nClustersInModule[thisModuleId] = newclusId[nclus - 1];
__syncthreads();

// mark bad cluster again
for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
if (0 == ok[i])
newclusId[i] = invalidModuleId + 1;
}
__syncthreads();

// reassign id
for (auto i = first; i < numElements; i += blockDim.x) {
if (id[i] == invalidModuleId)
continue; // not valid
if (id[i] != thisModuleId)
break; // end of module
clusterId[i] = newclusId[clusterId[i]] - 1;
if (clusterId[i] == invalidModuleId)
id[i] = invalidModuleId;
if (0 == ok[clusterId[i]])
clusterId[i] = id[i] = invalidModuleId;
else
clusterId[i] = newclusId[clusterId[i]] - 1;
}

//done
__syncthreads();
} // loop on modules
}

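Aside: the key new primitive in this hunk is __syncthreads_and(), which synchronizes the block and returns nonzero only if the predicate was nonzero for every thread; it lets the whole block skip the prefix scan and renumbering when no cluster failed the charge cut. A reduced sketch of the pattern (names and the shared-buffer size are hypothetical):

#include <cstdint>

__global__ void chargeCutSketch(int32_t const* charge, int nclus, int32_t cut) {
  __shared__ uint8_t ok[1024];  // assumes nclus <= 1024
  bool good = true;
  for (int i = threadIdx.x; i < nclus; i += blockDim.x) {
    ok[i] = charge[i] >= cut ? 1 : 0;
    if (0 == ok[i])
      good = false;
  }
  // block-wide AND: true only if every thread saw only passing clusters
  if (__syncthreads_and(good))
    return;  // nothing to renumber; the whole block exits together
  // ... otherwise run the prefix scan and cluster renumbering as in the hunk above
}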
5 changes: 5 additions & 0 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -135,6 +135,11 @@ namespace gpuClustering {
 #ifdef __CUDA_ARCH__
       // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
       constexpr int maxiter = 16;
+      if (threadIdx.x == 0 && (hist.size() / blockDim.x) >= maxiter)
+        printf("THIS IS NOT SUPPOSED TO HAPPEN too many hits in module %d: %d for block size %d\n",
+               thisModuleId,
+               hist.size(),
+               blockDim.x);
 #else
       auto maxiter = hist.size();
 #endif

Review thread on the new printf:

Author: There is an assert below, but it's compiled away! We should fix this!

Contributor: Should we add __trap() here then?

Author: When we have a heterogeneous version, yes.
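Aside: the thread above points out that the corresponding assert is compiled away in production builds, so the check vanishes exactly where it matters; __trap() is the device-side way to keep a hard failure. A sketch of what such a guard could look like (a hypothetical variant, not the committed code):

#include <cstdint>
#include <cstdio>

__device__ void guardIterationBudget(uint32_t histSize, uint32_t moduleId, uint32_t maxiter) {
  // fires even when assert() is compiled out (e.g. in NDEBUG production builds)
  if (threadIdx.x == 0 && histSize / blockDim.x >= maxiter) {
    printf("too many hits in module %u: %u for block size %u\n", moduleId, histSize, blockDim.x);
    __trap();  // terminate the kernel instead of overrunning the iteration budget
  }
}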
4 changes: 2 additions & 2 deletions RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
@@ -369,11 +369,11 @@ namespace pixelCPEforGPU {
     auto xoff = -float(phase1PixelTopology::xOffset) * comParams.thePitchX;
     int low_value = 0;
     int high_value = CPEFastParametrisation::kNumErrorBins - 1;
-    int bin_value = float(CPEFastParametrisation::kNumErrorBins) * (cp.xpos[ic] + xoff) / (2 * xoff);
+    int bin_value = float(CPEFastParametrisation::kNumErrorBins) * (cp.xpos[ic] + xoff) / (2.f * xoff);
     // return estimated bin value truncated to [0, 15]
     int jx = std::clamp(bin_value, low_value, high_value);

-    auto toCM = [](uint8_t x) { return float(x) * 1.e-4; };
+    auto toCM = [](uint8_t x) { return float(x) * 1.e-4f; };

     if (not isEdgeX) {
       cp.xerr[ic] = isOneX ? toCM(isBigX ? detParams.sx2 : detParams.sigmax1[jx])
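Aside: both changes add the missing f suffix so the arithmetic stays in single precision; an unsuffixed literal such as 1.e-4 is a double, which silently promotes the whole expression, and double-precision math is much slower on most GPUs. A tiny host-side illustration (the values are arbitrary):

#include <type_traits>

constexpr float x = 3.f;
constexpr auto promoted = x * 1.e-4;  // 1.e-4 is a double, so x is promoted and the result is double
constexpr auto single = x * 1.e-4f;   // stays float end to end
static_assert(std::is_same_v<decltype(promoted), const double>);
static_assert(std::is_same_v<decltype(single), const float>);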
@@ -1,5 +1,13 @@
 #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h"

+#include <mutex>
+
+namespace {
+  // cuda atomics are NOT atomic on CPU, so protect stat updates with a mutex
+  // waiting for a more general solution (including multiple devices) to be proposed and implemented
+  std::mutex lock_stat;
+}  // namespace

Review thread on the new lock_stat (lines +6 to +8):

Contributor: In the CPU case, what is doing concurrent updates of the same stat?

Author: Do you mean GPU? On the GPU, atomics on device memory are used. Multiple GPUs will crash.

Author: On the CPU (w/o the mutex) it was obviously producing wrong (lower) results (as expected).
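Aside: the problem the mutex works around is that on the CPU backend the CUDA-style atomics are not actually atomic, so two concurrent threads updating the same counter can lose increments. A reduced sketch of the fix (counter and function names are hypothetical):

#include <mutex>

namespace {
  std::mutex lock_stat;
  long long nTuplesTotal = 0;  // hypothetical shared counter
}  // namespace

void addToStats(long long n) {
  // without the lock, concurrent read-modify-write on the counter races and drops updates
  std::lock_guard guard(lock_stat);
  nTuplesTotal += n;
}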

template <>
void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) {
kernel_printCounters(counters);
@@ -134,25 +142,12 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   if (nhits > 1 && params_.lateFishbone_) {
     gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
   }
-
-  if (params_.doStats_) {
-    kernel_checkOverflows(tuples_d,
-                          device_tupleMultiplicity_.get(),
-                          device_hitToTuple_.get(),
-                          device_hitTuple_apc_,
-                          device_theCells_.get(),
-                          device_nCells_,
-                          device_theCellNeighbors_.get(),
-                          device_theCellTracks_.get(),
-                          isOuterHitOfCell_,
-                          nhits,
-                          params_.maxNumberOfDoublets_,
-                          counters_);
-  }
 }

template <>
void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
int32_t nhits = hh.nHits();

auto const *tuples_d = &tracks_d->hitIndices;
auto *quality_d = tracks_d->qualityData();

@@ -209,8 +204,26 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
                               device_hitToTuple_.get());
     }
   }
+
+  if (params_.doStats_) {
+    std::lock_guard guard(lock_stat);
+    kernel_checkOverflows(tuples_d,
+                          device_tupleMultiplicity_.get(),
+                          device_hitToTuple_.get(),
+                          device_hitTuple_apc_,
+                          device_theCells_.get(),
+                          device_nCells_,
+                          device_theCellNeighbors_.get(),
+                          device_theCellTracks_.get(),
+                          isOuterHitOfCell_,
+                          nhits,
+                          params_.maxNumberOfDoublets_,
+                          counters_);
+  }

   if (params_.doStats_) {
     // counters (add flag???)
+    std::lock_guard guard(lock_stat);
     kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_);
     kernel_doStatsForTracks(tuples_d, quality_d, counters_);
   }

Review thread on the two doStats blocks (lines +222 to +226):

Contributor: Why do we need two separate scopes, instead of just one?

Author: Historical.
@@ -107,9 +107,9 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
       printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId());
     if (thisCell.isKilled())
       atomicAdd(&c.nKilledCells, 1);
-    if (thisCell.unused())
+    if (!thisCell.unused())
       atomicAdd(&c.nEmptyCells, 1);
-    if (0 == hitToTuple->size(thisCell.inner_hit_id()) && 0 == hitToTuple->size(thisCell.outer_hit_id()))
+    if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id())))
       atomicAdd(&c.nZeroTrackCells, 1);
   }

Review thread on the nEmptyCells counter:

Contributor: Shouldn't nEmptyCells be renamed to nUsedCells?

Author: I now count "used" cells because they are much fewer (so faster); in the report I then print "1 -" that fraction.

@@ -896,7 +896,7 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun
       "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nLooseTracks | nGoodTracks | nUsedHits | "
       "nDupHits | "
       "nKilledCells | "
-      "nEmptyCells | nZeroTrackCells ||\n");
+      "nUsedCells | nZeroTrackCells ||\n");
   printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
          c.nEvents,
          c.nHits,
@@ -910,7 +910,7 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun
          c.nKilledCells,
          c.nEmptyCells,
          c.nZeroTrackCells);
-  printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f||\n",
+  printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f||\n",
          c.nEvents,
          c.nHits / double(c.nEvents),
          c.nCells / double(c.nEvents),
@@ -920,7 +920,7 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun
          c.nGoodTracks / double(c.nEvents),
          c.nUsedHits / double(c.nEvents),
          c.nDupHits / double(c.nEvents),
-         c.nKilledCells / double(c.nEvents),
+         c.nKilledCells / double(c.nCells),
          c.nEmptyCells / double(c.nCells),
          c.nZeroTrackCells / double(c.nCells));
 }
2 changes: 1 addition & 1 deletion RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -331,7 +331,7 @@ class GPUCACell {

   __device__ __forceinline__ int16_t layerPairId() const { return theLayerPairId_; }

-  __device__ __forceinline__ bool unused() const { return !theUsed_; }
+  __device__ __forceinline__ bool unused() const { return 0 == theUsed_; }
   __device__ __forceinline__ void setUsedBit(uint16_t bit) { theUsed_ |= bit; }

 private: