Improve Various Patatrack kernels #35835

Merged: 14 commits, Oct 27, 2021
@@ -38,8 +38,7 @@ void SiPixelGainCalibrationForHLTService::calibrate(
     if (isDeadColumn | isNoisyColumn)
       electron[i++] = 0;
     else {
-      float vcal = di->adc() * gain - pedestal * gain;
-      // float vcal = (di->adc() - DBpedestal) * DBgain;
+      float vcal = float(di->adc()) * gain - pedestal * gain;
       electron[i++] = int(vcal * conversionFactor + offset);
     }
   }
@@ -28,8 +28,7 @@ void SiPixelGainCalibrationServiceBase::calibrate(
     else {
       float DBgain = getGain(detID, col, row);
       float DBpedestal = getPedestal(detID, col, row) * DBgain;
-      float vcal = di->adc() * DBgain - DBpedestal;
-      // float vcal = (di->adc() - DBpedestal) * DBgain;
+      float vcal = float(di->adc()) * DBgain - DBpedestal;
       electron[i++] = int(vcal * conversionFactor + offset);
     }
   }
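Aside: the net effect of the two hunks above is an explicit integer-to-float conversion before the arithmetic (the stale commented-out variant is dropped). A minimal host-side sketch of the calibration formula, with made-up numbers (gain, pedestal and conversionFactor here are illustrative, not the conditions-database values; offset mirrors the -60 used for L2-4):

#include <cstdint>
#include <cstdio>

int main() {
  uint16_t adc = 137;                             // raw ADC count (made-up)
  float gain = 0.5f, pedestal = 20.f;             // per-pixel calibration (made-up)
  float conversionFactor = 65.f, offset = -60.f;  // VCal -> electrons (conversionFactor made-up)
  // explicit float() keeps the conversion visible and the math in single precision
  float vcal = float(adc) * gain - pedestal * gain;
  int electrons = int(vcal * conversionFactor + offset);
  std::printf("vcal = %.1f -> %d electrons\n", vcal, electrons);
  return 0;
}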
@@ -107,8 +107,8 @@ class SiPixelGainCalibrationForHLT {
 private:
   float encodeGain(const float& gain);
   float encodePed(const float& ped);
-  float decodeGain(unsigned int gain) const { return gain * gainPrecision + minGain_; }
-  float decodePed(unsigned int ped) const { return ped * pedPrecision + minPed_; }
+  float decodeGain(unsigned int gain) const { return float(gain) * gainPrecision + minGain_; }
+  float decodePed(unsigned int ped) const { return float(ped) * pedPrecision + minPed_; }

   std::vector<char> v_pedestals;  //@@@ blob streaming doesn't work with uint16_t and with classes
   std::vector<DetRegistry> indexes;
4 changes: 2 additions & 2 deletions CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h
@@ -56,8 +56,8 @@ class SiPixelGainForHLTonGPU {
     return std::make_pair(decodePed(s.ped & 0xFF), decodeGain(s.gain & 0xFF));
   }

-  constexpr float decodeGain(unsigned int gain) const { return gain * gainPrecision_ + minGain_; }
-  constexpr float decodePed(unsigned int ped) const { return ped * pedPrecision_ + minPed_; }
+  constexpr float decodeGain(unsigned int gain) const { return float(gain) * gainPrecision_ + minGain_; }
+  constexpr float decodePed(unsigned int ped) const { return float(ped) * pedPrecision_ + minPed_; }

   DecodingStructure* v_pedestals_;
   std::pair<Range, int> rangeAndCols_[gpuClustering::maxNumModules];
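Aside: decodeGain/decodePed undo a linear 8-bit packing, value = code * precision + min. A standalone sketch under assumed ranges (minPed, maxPed and the 255-step quantization are illustrative; the real precisions come from the calibration payload):

#include <cassert>

constexpr float minPed = 0.f, maxPed = 50.f;  // assumed range
constexpr float pedPrecision = (maxPed - minPed) / 255.f;

constexpr float decodePed(unsigned int ped) { return float(ped) * pedPrecision + minPed; }

int main() {
  assert(decodePed(0) == minPed);
  assert(decodePed(255) > 49.9f && decodePed(255) < 50.1f);  // top code maps back to ~maxPed
  return 0;
}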
@@ -582,16 +582,27 @@ namespace pixelgpudetails {
     int blocks =
         (std::max(int(wordCounter), int(gpuClustering::maxNumModules)) + threadsPerBlock - 1) / threadsPerBlock;

-    gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, stream>>>(isRun2,
-                                                                      digis_d.moduleInd(),
-                                                                      digis_d.xx(),
-                                                                      digis_d.yy(),
-                                                                      digis_d.adc(),
-                                                                      gains,
-                                                                      wordCounter,
-                                                                      clusters_d.moduleStart(),
-                                                                      clusters_d.clusInModule(),
-                                                                      clusters_d.clusModuleStart());
+    if (isRun2)
+      gpuCalibPixel::calibDigis<true><<<blocks, threadsPerBlock, 0, stream>>>(digis_d.moduleInd(),
+                                                                              digis_d.xx(),
+                                                                              digis_d.yy(),
+                                                                              digis_d.adc(),
+                                                                              gains,
+                                                                              wordCounter,
+                                                                              clusters_d.moduleStart(),
+                                                                              clusters_d.clusInModule(),
+                                                                              clusters_d.clusModuleStart());
+    else
+      gpuCalibPixel::calibDigis<false><<<blocks, threadsPerBlock, 0, stream>>>(digis_d.moduleInd(),
+                                                                               digis_d.xx(),
+                                                                               digis_d.yy(),
+                                                                               digis_d.adc(),
+                                                                               gains,
+                                                                               wordCounter,
+                                                                               clusters_d.moduleStart(),
+                                                                               clusters_d.clusInModule(),
+                                                                               clusters_d.clusModuleStart());

     cudaCheck(cudaGetLastError());
 #ifdef GPU_DEBUG
     cudaDeviceSynchronize();
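Aside: the rewrite above trades the runtime bool argument for a compile-time template parameter, so each kernel instantiation carries only the code path it needs. A minimal sketch of the same dispatch pattern (kernel and names are hypothetical):

template <bool doConvert>
__global__ void kernelSketch(float* data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n)
    return;
  if constexpr (doConvert)
    data[i] = data[i] * 65.f - 60.f;  // this branch exists only in the <true> instantiation
}

void launchSketch(bool doConvert, float* data, int n, cudaStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  // the runtime flag just selects one of two pre-compiled instantiations
  if (doConvert)
    kernelSketch<true><<<blocks, threads, 0, stream>>>(data, n);
  else
    kernelSketch<false><<<blocks, threads, 0, stream>>>(data, n);
}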
@@ -607,7 +618,7 @@ namespace pixelgpudetails {
         digis_d.moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter);
     cudaCheck(cudaGetLastError());

-    threadsPerBlock = 256;
+    threadsPerBlock = 256 + 128;  /// should be larger than 6000/16 aka (maxPixInModule/maxiter in the kernel)
     blocks = maxNumModules;
 #ifdef GPU_DEBUG
     std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n";
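Aside: the new comment encodes a simple constraint: findClus covers a module in at most 16 iterations, so with up to 6000 pixels per module a block needs more than 6000 / 16 = 375 threads, and 256 + 128 = 384 is the next multiple of 128 above that. The same arithmetic as a compile-time check (constant names are assumptions taken from the comment):

constexpr int maxPixInModule = 6000;  // assumed, from the comment above
constexpr int maxiter = 16;
constexpr int threadsPerBlock = 256 + 128;
static_assert(threadsPerBlock > maxPixInModule / maxiter,
              "a block must span a full module within the 16-iteration budget");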
16 changes: 9 additions & 7 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h
@@ -18,8 +18,8 @@ namespace gpuCalibPixel {
   constexpr float VCaltoElectronOffset = -60;      // L2-4:  -60 +- 130
   constexpr float VCaltoElectronOffset_L1 = -670;  // L1:   -670 +- 220

-  __global__ void calibDigis(bool isRun2,
-                             uint16_t* id,
+  template <bool isRun2>
+  __global__ void calibDigis(uint16_t* id,
                              uint16_t const* __restrict__ x,
                              uint16_t const* __restrict__ y,
                              uint16_t* adc,
@@ -42,9 +42,6 @@ namespace gpuCalibPixel {
       if (invalidModuleId == id[i])
         continue;

-      float conversionFactor = (isRun2) ? (id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain) : 1.f;
-      float offset = (isRun2) ? (id[i] < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset) : 0;
-
       bool isDeadColumn = false, isNoisyColumn = false;

       int row = x[i];
@@ -58,8 +55,13 @@ namespace gpuCalibPixel {
         id[i] = invalidModuleId;
         adc[i] = 0;
       } else {
-        float vcal = adc[i] * gain - pedestal * gain;
-        adc[i] = std::max(100, int(vcal * conversionFactor + offset));
+        float vcal = float(adc[i]) * gain - pedestal * gain;
+        if constexpr (isRun2) {
+          float conversionFactor = id[i] < 96 ? VCaltoElectronGain_L1 : VCaltoElectronGain;
+          float offset = id[i] < 96 ? VCaltoElectronOffset_L1 : VCaltoElectronOffset;
+          vcal = vcal * conversionFactor + offset;
+        }
+        adc[i] = std::max(100, int(vcal));
       }
     }
   }

Review thread on the id[i] < 96 comparison:

Contributor: 96 -> phase1PixelTopology::layerStart[1] ?

Author: There is an issue open to fix this everywhere (phase1PixelTopology::layerStart[1] may not compile, or may compile and produce wrong results).

Contributor: Why would it not compile and/or produce wrong results? Can you point me to the issue? (I might have missed it.)

Contributor: For the record (thanks @VinInn): #35370

Review comment on the std::max(100, int(vcal)) line:

Contributor: @cms-sw/trk-dpg-l2 it would be nice if this 100 were taken from the same place as the other famous 100 in the regular clusterizer.
29 changes: 13 additions & 16 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
@@ -87,43 +87,40 @@ namespace gpuClustering {

auto chargeCut =
clusterThresholds.getThresholdForLayerOnCondition(thisModuleId < phase1PixelTopology::layerStart[1]);

bool good = true;
for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
newclusId[i] = ok[i] = charge[i] >= chargeCut ? 1 : 0;
if (0 == ok[i])
good = false;
}

__syncthreads();
// if all clusters above threshold do nothing
if (__syncthreads_and(good))
continue;

// renumber
__shared__ uint16_t ws[32];
cms::cuda::blockPrefixScan(newclusId, nclus, ws);

assert(nclus >= newclusId[nclus - 1]);

if (nclus == newclusId[nclus - 1])
continue;
assert(nclus > newclusId[nclus - 1]);

nClustersInModule[thisModuleId] = newclusId[nclus - 1];
__syncthreads();

// mark bad cluster again
for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
if (0 == ok[i])
newclusId[i] = invalidModuleId + 1;
}
__syncthreads();

// reassign id
for (auto i = first; i < numElements; i += blockDim.x) {
if (id[i] == invalidModuleId)
continue; // not valid
if (id[i] != thisModuleId)
break; // end of module
clusterId[i] = newclusId[clusterId[i]] - 1;
if (clusterId[i] == invalidModuleId)
id[i] = invalidModuleId;
if (0 == ok[clusterId[i]])
clusterId[i] = id[i] = invalidModuleId;
else
clusterId[i] = newclusId[clusterId[i]] - 1;
}

//done
__syncthreads();
} // loop on modules
}

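Aside: the key new primitive in this hunk is __syncthreads_and(), which synchronizes the block and returns nonzero only if the predicate was nonzero for every thread; it lets the whole block skip the prefix scan and renumbering when no cluster failed the charge cut. A reduced sketch of the pattern (names and the shared-buffer size are hypothetical):

#include <cstdint>

__global__ void chargeCutSketch(int32_t const* charge, int nclus, int32_t cut) {
  __shared__ uint8_t ok[1024];  // assumes nclus <= 1024
  bool good = true;
  for (int i = threadIdx.x; i < nclus; i += blockDim.x) {
    ok[i] = charge[i] >= cut ? 1 : 0;
    if (0 == ok[i])
      good = false;
  }
  // block-wide AND: true only if every thread saw only passing clusters
  if (__syncthreads_and(good))
    return;  // nothing to renumber; the whole block exits together
  // ... otherwise run the prefix scan and cluster renumbering as in the hunk above
}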
5 changes: 5 additions & 0 deletions RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -135,6 +135,11 @@ namespace gpuClustering {
 #ifdef __CUDA_ARCH__
       // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
       constexpr int maxiter = 16;
+      if (threadIdx.x == 0 && (hist.size() / blockDim.x) >= maxiter)
+        printf("THIS IS NOT SUPPOSED TO HAPPEN too many hits in module %d: %d for block size %d\n",
+               thisModuleId,
+               hist.size(),
+               blockDim.x);
 #else
       auto maxiter = hist.size();
 #endif

Review thread on the new printf:

Author: There is an assert below, but it's compiled away! We should fix this!

Contributor: Should we add __trap() here then?

Author: When we have a heterogeneous version, yes.
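Aside: the thread above points out that the corresponding assert is compiled away in production builds, so the check vanishes exactly where it matters; __trap() is the device-side way to keep a hard failure. A sketch of what such a guard could look like (a hypothetical variant, not the committed code):

#include <cstdint>
#include <cstdio>

__device__ void guardIterationBudget(uint32_t histSize, uint32_t moduleId, uint32_t maxiter) {
  // fires even when assert() is compiled out (e.g. in NDEBUG production builds)
  if (threadIdx.x == 0 && histSize / blockDim.x >= maxiter) {
    printf("too many hits in module %u: %u for block size %u\n", moduleId, histSize, blockDim.x);
    __trap();  // terminate the kernel instead of overrunning the iteration budget
  }
}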
4 changes: 2 additions & 2 deletions RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
@@ -369,11 +369,11 @@ namespace pixelCPEforGPU {
     auto xoff = -float(phase1PixelTopology::xOffset) * comParams.thePitchX;
     int low_value = 0;
     int high_value = CPEFastParametrisation::kNumErrorBins - 1;
-    int bin_value = float(CPEFastParametrisation::kNumErrorBins) * (cp.xpos[ic] + xoff) / (2 * xoff);
+    int bin_value = float(CPEFastParametrisation::kNumErrorBins) * (cp.xpos[ic] + xoff) / (2.f * xoff);
     // return estimated bin value truncated to [0, 15]
     int jx = std::clamp(bin_value, low_value, high_value);

-    auto toCM = [](uint8_t x) { return float(x) * 1.e-4; };
+    auto toCM = [](uint8_t x) { return float(x) * 1.e-4f; };

     if (not isEdgeX) {
       cp.xerr[ic] = isOneX ? toCM(isBigX ? detParams.sx2 : detParams.sigmax1[jx])
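Aside: both changes add the missing f suffix so the arithmetic stays in single precision; an unsuffixed literal such as 1.e-4 is a double, which silently promotes the whole expression, and double-precision math is much slower on most GPUs. A tiny host-side illustration (the values are arbitrary):

#include <type_traits>

constexpr float x = 3.f;
constexpr auto promoted = x * 1.e-4;  // 1.e-4 is a double, so x is promoted and the result is double
constexpr auto single = x * 1.e-4f;   // stays float end to end
static_assert(std::is_same_v<decltype(promoted), const double>);
static_assert(std::is_same_v<decltype(single), const float>);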
@@ -1,5 +1,13 @@
 #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h"

+#include <mutex>
+
+namespace {
+  // cuda atomics are NOT atomic on CPU, so protect stat updates with a mutex
+  // waiting for a more general solution (including multiple devices) to be proposed and implemented
+  std::mutex lock_stat;
+}  // namespace

Review thread on the new lock_stat (lines +6 to +8):

Contributor: In the CPU case, what is doing concurrent updates of the same stat?

Author: Do you mean GPU? On the GPU, atomics on device memory are used. Multiple GPUs will crash.

Author: On the CPU (w/o the mutex) it was obviously producing wrong (lower) results (as expected).
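Aside: the problem the mutex works around is that on the CPU backend the CUDA-style atomics are not actually atomic, so two concurrent threads updating the same counter can lose increments. A reduced sketch of the fix (counter and function names are hypothetical):

#include <mutex>

namespace {
  std::mutex lock_stat;
  long long nTuplesTotal = 0;  // hypothetical shared counter
}  // namespace

void addToStats(long long n) {
  // without the lock, concurrent read-modify-write on the counter races and drops updates
  std::lock_guard guard(lock_stat);
  nTuplesTotal += n;
}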

template <>
void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) {
kernel_printCounters(counters);
@@ -134,25 +142,12 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
   if (nhits > 1 && params_.lateFishbone_) {
     gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
   }
-
-  if (params_.doStats_) {
-    kernel_checkOverflows(tuples_d,
-                          device_tupleMultiplicity_.get(),
-                          device_hitToTuple_.get(),
-                          device_hitTuple_apc_,
-                          device_theCells_.get(),
-                          device_nCells_,
-                          device_theCellNeighbors_.get(),
-                          device_theCellTracks_.get(),
-                          isOuterHitOfCell_,
-                          nhits,
-                          params_.maxNumberOfDoublets_,
-                          counters_);
-  }
 }

template <>
void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) {
int32_t nhits = hh.nHits();

auto const *tuples_d = &tracks_d->hitIndices;
auto *quality_d = tracks_d->qualityData();

@@ -209,8 +204,26 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA
                               device_hitToTuple_.get());
     }
   }
+
+  if (params_.doStats_) {
+    std::lock_guard guard(lock_stat);
+    kernel_checkOverflows(tuples_d,
+                          device_tupleMultiplicity_.get(),
+                          device_hitToTuple_.get(),
+                          device_hitTuple_apc_,
+                          device_theCells_.get(),
+                          device_nCells_,
+                          device_theCellNeighbors_.get(),
+                          device_theCellTracks_.get(),
+                          isOuterHitOfCell_,
+                          nhits,
+                          params_.maxNumberOfDoublets_,
+                          counters_);
+  }

   if (params_.doStats_) {
     // counters (add flag???)
+    std::lock_guard guard(lock_stat);
     kernel_doStatsForHitInTracks(device_hitToTuple_.get(), counters_);
     kernel_doStatsForTracks(tuples_d, quality_d, counters_);
   }

Review thread on the two doStats blocks (lines +222 to +226):

Contributor: Why do we need two separate scopes, instead of just one?

Author: Historical.
@@ -107,9 +107,9 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets,
       printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId());
     if (thisCell.isKilled())
       atomicAdd(&c.nKilledCells, 1);
-    if (thisCell.unused())
+    if (!thisCell.unused())
       atomicAdd(&c.nEmptyCells, 1);
-    if (0 == hitToTuple->size(thisCell.inner_hit_id()) && 0 == hitToTuple->size(thisCell.outer_hit_id()))
+    if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id())))
       atomicAdd(&c.nZeroTrackCells, 1);
   }

Review thread on the nEmptyCells counter:

Contributor: Shouldn't nEmptyCells be renamed to nUsedCells?

Author: I now count "used" cells because they are much fewer (so faster); in the report I then print "1 -" that fraction.

@@ -896,7 +896,7 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun
       "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nLooseTracks | nGoodTracks | nUsedHits | "
       "nDupHits | "
       "nKilledCells | "
-      "nEmptyCells | nZeroTrackCells ||\n");
+      "nUsedCells | nZeroTrackCells ||\n");
   printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
          c.nEvents,
          c.nHits,
@@ -910,7 +910,7 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun
          c.nKilledCells,
          c.nEmptyCells,
          c.nZeroTrackCells);
-  printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f||\n",
+  printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| %.3f||\n",
          c.nEvents,
          c.nHits / double(c.nEvents),
          c.nCells / double(c.nEvents),
@@ -920,7 +920,7 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun
          c.nGoodTracks / double(c.nEvents),
          c.nUsedHits / double(c.nEvents),
          c.nDupHits / double(c.nEvents),
-         c.nKilledCells / double(c.nEvents),
+         c.nKilledCells / double(c.nCells),
          c.nEmptyCells / double(c.nCells),
          c.nZeroTrackCells / double(c.nCells));
 }
2 changes: 1 addition & 1 deletion RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -331,7 +331,7 @@ class GPUCACell {

   __device__ __forceinline__ int16_t layerPairId() const { return theLayerPairId_; }

-  __device__ __forceinline__ bool unused() const { return !theUsed_; }
+  __device__ __forceinline__ bool unused() const { return 0 == theUsed_; }
   __device__ __forceinline__ void setUsedBit(uint16_t bit) { theUsed_ |= bit; }

 private: