Merge pull request #37444 from missirol/backport_37281_to_123X
Backport to `12_3_X` of Patatrack pixel-reco updates (pT-clamping in vertex sorting + stabilized Fishbone)
cmsbuild committed Apr 5, 2022
2 parents 6e4ac3f + 3c15c2f commit 1ed9513
Showing 13 changed files with 114 additions and 60 deletions.
6 changes: 6 additions & 0 deletions CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
@@ -45,6 +45,9 @@ class TrackSoAHeterogeneousT {

eigenSoA::ScalarSoA<int8_t, S> nLayers;

constexpr int nTracks() const { return nTracks_; }
constexpr void setNTracks(int n) { nTracks_ = n; }

constexpr int nHits(int i) const { return detIndices.size(i); }

constexpr bool isTriplet(int i) const { return nLayers(i) == 3; }
@@ -80,6 +83,9 @@ class TrackSoAHeterogeneousT {

HitContainer hitIndices;
HitContainer detIndices;

private:
int nTracks_;
};

namespace pixelTrack {
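The main change in this file is that the track SoA now carries an explicit track count next to its fixed-capacity arrays, so consumers no longer have to scan to the first nHits == 0 sentinel. A minimal standalone sketch of the pattern (a hypothetical mock, not the actual CMSSW TrackSoAHeterogeneousT):

#include <array>
#include <cassert>

// Hypothetical mock of the SoA pattern above: storage is sized for the worst case
// (stride), and the producer records how many entries were actually filled.
template <int S>
struct MockTrackSoA {
  std::array<int, S> nHitsPerTrack{};  // 0 marks an unused slot (the old sentinel)
  static constexpr int stride() { return S; }

  constexpr int nTracks() const { return nTracks_; }
  constexpr void setNTracks(int n) { nTracks_ = n; }

private:
  int nTracks_ = 0;
};

int main() {
  MockTrackSoA<8> soa;
  soa.nHitsPerTrack = {{4, 3, 5, 0, 0, 0, 0, 0}};
  soa.setNTracks(3);

  // Old style: scan the full stride until the nHits == 0 sentinel.
  int counted = 0;
  for (int i = 0; i < soa.stride(); ++i) {
    if (soa.nHitsPerTrack[i] == 0)
      break;
    ++counted;
  }

  // New style: trust the explicit count; the scan above is only a cross-check here.
  assert(counted == soa.nTracks());
  return 0;
}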
@@ -154,16 +154,15 @@ void PixelTrackProducerFromSoA::produce(edm::StreamID streamID,
auto const *quality = tsoa.qualityData();
auto const &fit = tsoa.stateAtBS;
auto const &hitIndices = tsoa.hitIndices;
auto maxTracks = tsoa.stride();
auto nTracks = tsoa.nTracks();

tracks.reserve(maxTracks);
tracks.reserve(nTracks);

int32_t nt = 0;

for (int32_t it = 0; it < maxTracks; ++it) {
for (int32_t it = 0; it < nTracks; ++it) {
auto nHits = tsoa.nHits(it);
if (nHits == 0)
break; // this is a guard: maybe we need to move to nTracks...
assert(nHits >= 3);
indToEdm.push_back(-1);
auto q = quality[it];
if (q < minQuality_)
@@ -17,7 +17,7 @@
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"

// Switch on to enable checks and printout for found tracks
#undef PIXEL_DEBUG_PRODUCE
// #define PIXEL_DEBUG_PRODUCE

class PixelTrackSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
public:
@@ -63,7 +63,9 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
#ifdef PIXEL_DEBUG_PRODUCE
auto const& tsoa = *soa_;
auto maxTracks = tsoa.stride();
std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks << std::endl;
auto nTracks = tsoa.nTracks();
std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
std::cout << "found " << nTracks << " tracks in cpu SoA at " << &tsoa << std::endl;

int32_t nt = 0;
for (int32_t it = 0; it < maxTracks; ++it) {
@@ -73,7 +75,7 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
break; // this is a guard: maybe we need to move to nTracks...
nt++;
}
std::cout << "found " << nt << " tracks in cpu SoA at " << &tsoa << std::endl;
assert(nTracks == nt);
#endif

// DO NOT make a copy (actually TWO....)
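The debug switch here changes from an explicit #undef to a commented-out #define, so enabling the consistency checks is a one-line edit; the checks themselves now verify that the scanned track count matches tsoa.nTracks(). A generic sketch of this compile-time guard pattern (names are illustrative, not from the PR):

#include <cstdio>

// Uncomment to enable verbose checks in this translation unit (illustrative pattern).
// #define MY_PIXEL_DEBUG

void reportTracks(int nTracks) {
#ifdef MY_PIXEL_DEBUG
  // Compiled in only when the macro is defined; no runtime cost otherwise.
  std::printf("found %d tracks\n", nTracks);
#else
  (void)nTracks;  // silence unused-parameter warnings when the guard is off
#endif
}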
@@ -129,7 +129,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d);

kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d);
kernel_fillNLayers(tracks_d);
kernel_fillNLayers(tracks_d, device_hitTuple_apc_);

// remove duplicates (tracks that share a doublet)
kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_);
@@ -213,7 +213,11 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA

#ifdef DUMP_GPU_TK_TUPLES
static std::atomic<int> iev(0);
++iev;
kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev);
static std::mutex lock;
{
std::lock_guard<std::mutex> guard(lock);
++iev;
kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 0, 1000000, iev);
}
#endif
}
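Both the CPU and GPU versions of the DUMP_GPU_TK_TUPLES block now serialize the dump: the event-counter increment and the printout run under a static mutex, so concurrent streams cannot interleave their output or tear the pairing between iev and the printed tuples. A standalone sketch of the pattern (illustrative, not the actual kernel call):

#include <atomic>
#include <cstdio>
#include <mutex>

// A dump routine that may be called from several threads at once, but whose counter
// update and printout must appear as one uninterrupted block per call.
void dumpTuples(int nTuples) {
  static std::atomic<int> iev(0);
  static std::mutex lock;
  {
    std::lock_guard<std::mutex> guard(lock);
    ++iev;
    std::printf("event %d: dumping %d tuples\n", iev.load(), nTuples);
  }
}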
@@ -1,4 +1,5 @@
#include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h"
#include <mutex>

template <>
void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
@@ -90,7 +91,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *

kernel_fillHitDetIndices<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, hh.view(), detId_d);
cudaCheck(cudaGetLastError());
kernel_fillNLayers<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d);
kernel_fillNLayers<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tracks_d, device_hitTuple_apc_);
cudaCheck(cudaGetLastError());

// remove duplicates (tracks that share a doublet)
@@ -330,9 +331,20 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA

#ifdef DUMP_GPU_TK_TUPLES
static std::atomic<int> iev(0);
++iev;
kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev);
static std::mutex lock;
{
std::lock_guard<std::mutex> guard(lock);
++iev;
for (int k = 0; k < 20000; k += 500) {
kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), k, k + 500, iev);
cudaDeviceSynchronize();
}
kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(
hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 20000, 1000000, iev);
cudaDeviceSynchronize();
// cudaStreamSynchronize(cudaStream);
}
#endif
}

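On the GPU the dump is additionally split into fixed-size index windows, with a device synchronization after each launch so the printf buffer is flushed before the next window starts. A simplified CUDA sketch of the idea (a toy kernel, not the actual kernel_print_found_ntuplets):

#include <algorithm>
#include <cstdio>

// Toy kernel: each of the 32 threads walks the window [first, last) with a stride of blockDim.x.
__global__ void printRange(int first, int last) {
  for (int i = first + threadIdx.x; i < last; i += blockDim.x)
    printf("item %d\n", i);
}

// Host-side driver: print nItems entries in windows of 500, flushing after each window.
void dumpInChunks(int nItems, cudaStream_t stream) {
  constexpr int chunk = 500;
  for (int k = 0; k < nItems; k += chunk) {
    printRange<<<1, 32, 0, stream>>>(k, std::min(k + chunk, nItems));
    cudaDeviceSynchronize();  // wait so this window's output appears before the next launch
  }
}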
@@ -21,6 +21,7 @@ namespace cAHitNtupletGenerator {
unsigned long long nGoodTracks;
unsigned long long nUsedHits;
unsigned long long nDupHits;
unsigned long long nFishCells;
unsigned long long nKilledCells;
unsigned long long nEmptyCells;
unsigned long long nZeroTrackCells;
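The Counters struct gains an nFishCells field; as the kernel_checkOverflows hunk below shows, it is filled on the device with one atomic increment per cell that still carries a fishbone and was not killed. A minimal standalone sketch of that counting pattern (mock types, not the CMSSW ones):

// Mock event-level counters (the real struct is cAHitNtupletGenerator::Counters).
struct MockCounters {
  unsigned long long nFishCells;
};

// Grid-stride loop over cells: every thread that sees a live fishbone bumps the shared
// counter atomically, so the total is correct regardless of thread ordering.
__global__ void countFishCells(bool const* hasFishbone, bool const* isKilled, int nCells, MockCounters* c) {
  int first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int idx = first; idx < nCells; idx += gridDim.x * blockDim.x) {
    if (hasFishbone[idx] && !isKilled[idx])
      atomicAdd(&c->nFishCells, 1ull);
  }
}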
@@ -101,6 +101,8 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,

for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) {
auto const &thisCell = cells[idx];
if (thisCell.hasFishbone() && !thisCell.isKilled())
atomicAdd(&c.nFishCells, 1);
if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId];
printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId());
if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId];
@@ -194,7 +196,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,

/* chi2 penalize higher-pt tracks (try rescale it?)
auto score = [&](auto it) {
return tracks->nHits(it) < 4 ?
return tracks->nLayers(it) < 4 ?
std::abs(tracks->tip(it)) : // tip for triplets
tracks->chi2(it); //chi2 for quads
};
@@ -204,7 +206,7 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells,

// full crazy combinatorics
int ntr = thisCell.tracks().size();
for (int i = 0; i < ntr; ++i) {
for (int i = 0; i < ntr - 1; ++i) {
auto it = thisCell.tracks()[i];
auto qi = tracks->quality(it);
if (qi <= reject)
@@ -557,13 +559,15 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples
}
}

__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks) {
__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) {
auto &tracks = *ptracks;
auto first = blockIdx.x * blockDim.x + threadIdx.x;
for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) {
auto ntracks = apc->get().m;
if (0 == first)
tracks.setNTracks(ntracks);
for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) {
auto nHits = tracks.nHits(idx);
if (nHits == 0)
break; // this is a guard: maybe we need to move to nTracks...
assert(nHits >= 3);
tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx);
}
}
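kernel_fillNLayers now receives the AtomicPairCounter used to bulk-fill the tuples, reads the number of stored tracks from it, has thread 0 record that count in the SoA, and bounds its loop by it instead of scanning up to the stride. A simplified mock of the pattern (stand-in types reproducing only the members used above; the real counter is cms::cuda::AtomicPairCounter):

// Stand-in types, illustrative only.
struct MockPairCounter {
  struct Counters { unsigned int n, m; };
  Counters counters;
  __device__ Counters get() const { return counters; }
};

struct MockTrackSoA {
  int nTracks_;
  __device__ void setNTracks(int n) { nTracks_ = n; }
};

__global__ void fillNLayersSketch(MockTrackSoA* tracks, MockPairCounter const* apc) {
  auto first = blockIdx.x * blockDim.x + threadIdx.x;
  auto ntracks = apc->get().m;    // number of tuples actually stored by the bulk fill
  if (0 == first)
    tracks->setNTracks(ntracks);  // record the count once in the SoA
  for (int idx = first; idx < int(ntracks); idx += gridDim.x * blockDim.x) {
    // per-track work (computing the number of layers in the real kernel) goes here
  }
}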
@@ -666,7 +670,7 @@ __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks,
auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); };

// full combinatorics
for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) {
auto const it = *ip;
auto qi = quality[it];
if (qi <= reject)
@@ -676,7 +680,7 @@ __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks,
auto cti = tracks.stateAtBS.state(it)(3);
auto e2cti = tracks.stateAtBS.covariance(it)(12);
auto nli = tracks.nLayers(it);
for (auto jp = ip + 1; jp != hitToTuple.end(idx); ++jp) {
for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) {
auto const jt = *jp;
auto qj = quality[jt];
if (qj <= reject)
@@ -860,19 +864,25 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res
TkSoA const *__restrict__ ptracks,
Quality const *__restrict__ quality,
CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple,
int32_t maxPrint,
int32_t firstPrint,
int32_t lastPrint,
int iev) {
constexpr auto loose = pixelTrack::Quality::loose;
auto const &hh = *hhp;
auto const &foundNtuplets = *ptuples;
auto const &tracks = *ptracks;
int first = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = first, np = std::min(maxPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x;
for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) {
auto nh = foundNtuplets.size(i);
if (nh < 3)
continue;
printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n",
if (quality[i] < loose)
continue;
printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
10000 * iev + i,
int(quality[i]),
nh,
tracks.nLayers(i),
tracks.charge(i),
tracks.pt(i),
tracks.eta(i),
@@ -881,11 +891,13 @@ __global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__res
tracks.zip(i),
// asinhf(fit_results[i].par(3)),
tracks.chi2(i),
*foundNtuplets.begin(i),
*(foundNtuplets.begin(i) + 1),
*(foundNtuplets.begin(i) + 2),
nh > 3 ? int(*(foundNtuplets.begin(i) + 3)) : -1,
nh > 4 ? int(*(foundNtuplets.begin(i) + 4)) : -1);
hh.zGlobal(*foundNtuplets.begin(i)),
hh.zGlobal(*(foundNtuplets.begin(i) + 1)),
hh.zGlobal(*(foundNtuplets.begin(i) + 2)),
nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0,
nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0,
nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0,
nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0);
}
}

@@ -894,22 +906,24 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun
printf(
"||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nLooseTracks | nGoodTracks | nUsedHits | "
"nDupHits | "
"nFishCells | "
"nKilledCells | "
"nUsedCells | nZeroTrackCells ||\n");
printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
c.nEvents,
c.nHits,
c.nCells,
c.nTuples,
c.nFitTracks,
c.nLooseTracks,
c.nGoodTracks,
c.nFitTracks,
c.nUsedHits,
c.nDupHits,
c.nFishCells,
c.nKilledCells,
c.nEmptyCells,
c.nZeroTrackCells);
printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f||\n",
printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f| %.3f||\n",
c.nEvents,
c.nHits / double(c.nEvents),
c.nCells / double(c.nEvents),
Expand All @@ -919,6 +933,7 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun
c.nGoodTracks / double(c.nEvents),
c.nUsedHits / double(c.nEvents),
c.nDupHits / double(c.nEvents),
c.nFishCells / double(c.nCells),
c.nKilledCells / double(c.nCells),
c.nEmptyCells / double(c.nCells),
c.nZeroTrackCells / double(c.nCells));
@@ -83,10 +83,11 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet&
cfg.getParameter<double>("dcaCutOuterTriplet"),
makeQualityCuts(cfg.getParameterSet("trackQualityCuts"))) {
#ifdef DUMP_GPU_TK_TUPLES
printf("TK: %s %s % %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
printf("TK: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
"tid",
"qual",
"nh",
"nl",
"charge",
"pt",
"eta",
@@ -98,7 +99,8 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet&
"h2",
"h3",
"h4",
"h5");
"h5",
"hn");
#endif

if (m_params.onGPU_) {
18 changes: 13 additions & 5 deletions RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -22,6 +22,7 @@ class GPUCACell {
using PtrAsInt = unsigned long long;

static constexpr auto maxCellsPerHit = caConstants::maxCellsPerHit;
static constexpr auto invalidHitId = std::numeric_limits<caConstants::hindex_type>::max();
using OuterHitOfCellContainer = caConstants::OuterHitOfCellContainer;
using OuterHitOfCell = caConstants::OuterHitOfCell;
using CellNeighbors = caConstants::CellNeighbors;
@@ -52,7 +53,7 @@ class GPUCACell {
theOuterHitId = outerHitId;
theLayerPairId_ = layerPairId;
theStatus_ = 0;
theFishboneId = std::numeric_limits<hindex_type>::max();
theFishboneId = invalidHitId;

// optimization that depends on access pattern
theInnerZ = hh.zGlobal(innerHitId);
@@ -342,11 +343,18 @@ class GPUCACell {
__device__ __forceinline__ bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); }
__device__ __forceinline__ void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); }

__device__ __forceinline__ void setFishbone(hindex_type id) { theFishboneId = id; }
__device__ __forceinline__ auto fishboneId() const { return theFishboneId; }
__device__ __forceinline__ bool hasFishbone() const {
return theFishboneId != std::numeric_limits<hindex_type>::max();
__device__ __forceinline__ void setFishbone(hindex_type id, float z, Hits const& hh) {
// make it deterministic: use the farther apart (in z)
auto old = theFishboneId;
while (
old !=
atomicCAS(&theFishboneId,
old,
(invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh.zGlobal(old) - theInnerZ)) ? id : old))
old = theFishboneId;
}
__device__ __forceinline__ auto fishboneId() const { return theFishboneId; }
__device__ __forceinline__ bool hasFishbone() const { return theFishboneId != invalidHitId; }

private:
CellNeighbors* theOuterNeighbors;
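setFishbone is now deterministic under concurrent updates: instead of a plain store it runs a compare-and-swap loop that keeps whichever candidate hit lies farther in z from the cell's inner hit, so the stored id no longer depends on thread scheduling. A standalone device-side sketch of this lock-free "keep the best candidate" pattern (a generic helper, not the CMSSW member function):

#include <cmath>

// slot holds a hit index or the invalid marker; zOf maps a hit index to its z coordinate;
// refZ is the reference (inner-hit) z. Several threads may call this concurrently with
// different candidates; the one farthest in z from refZ wins.
__device__ void keepFarthestInZ(unsigned int* slot, unsigned int id, float z, float refZ, float const* zOf) {
  constexpr unsigned int invalid = 0xffffffffu;
  unsigned int old = *slot;
  while (true) {
    bool better = (old == invalid) || (fabsf(z - refZ) > fabsf(zOf[old] - refZ));
    unsigned int desired = better ? id : old;
    unsigned int prev = atomicCAS(slot, old, desired);
    if (prev == old)
      break;     // our snapshot was current: either id was stored or the better old value kept
    old = prev;  // another thread updated the slot; re-evaluate against its value
  }
}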
10 changes: 5 additions & 5 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
@@ -75,25 +75,25 @@ namespace gpuPixelDoublets {
// must be different detectors
// if (d[ic]==d[jc]) continue;
auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc];
if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) {
if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * (n[ic] * n[jc])) {
// alligned: kill farthest (prefer consecutive layers)
// if same layer prefer farthest (longer level arm) and make space for intermediate hit
bool sameLayer = l[ic] == l[jc];
if (n[ic] > n[jc]) {
if (sameLayer) {
cj.kill(); // closest
ci.setFishbone(cj.inner_hit_id());
ci.setFishbone(cj.inner_hit_id(), cj.inner_z(hh), hh);
} else {
ci.kill(); // farthest
break;
// break; // removed to improve reproducibility. keep it for reference and tests
}
} else {
if (!sameLayer) {
cj.kill(); // farthest
} else {
ci.kill(); // closest
cj.setFishbone(ci.inner_hit_id());
break;
cj.setFishbone(ci.inner_hit_id(), ci.inner_z(hh), hh);
// break; // removed to improve reproducibility. keep it for reference and tests
}
}
}
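Two things change in the fishbone loop: the break statements are removed so every aligned pair is inspected (trading a little extra work for run-to-run reproducibility, as the diff comments note), and the alignment cut gains parentheses, presumably to pin down the floating-point evaluation order. The cut itself compares the squared cosine of the angle between the two doublets sharing the inner hit against a threshold, with no square root needed. A small host-side helper illustrating that test (hypothetical, not part of the PR):

// Doublets a and b (direction vectors from the shared inner hit) are "aligned" when
//   cos^2(theta) >= cut2,  tested as  (a.b)^2 >= cut2 * (|a|^2 * |b|^2).
bool aligned(float ax, float ay, float az, float bx, float by, float bz, float cut2 = 0.99999f) {
  float dot = ax * bx + ay * by + az * bz;  // a . b
  float n2a = ax * ax + ay * ay + az * az;  // |a|^2
  float n2b = bx * bx + by * by + bz * bz;  // |b|^2
  return dot * dot >= cut2 * (n2a * n2b);
}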
