Skip to content

Commit

Permalink
Merge pull request #35285 from VinInn/OuterHitOfCell
Browse files Browse the repository at this point in the history
reduce size of isOuterHitOfCell in patatrack
  • Loading branch information
cmsbuild committed Oct 1, 2021
2 parents 00f808e + 592ffaa commit 7e79ae6
Show file tree
Hide file tree
Showing 17 changed files with 94 additions and 59 deletions.
Expand Up @@ -18,9 +18,13 @@ class SiPixelClustersCUDA {
SiPixelClustersCUDA(SiPixelClustersCUDA &&) = default;
SiPixelClustersCUDA &operator=(SiPixelClustersCUDA &&) = default;

void setNClusters(uint32_t nClusters) { nClusters_h = nClusters; }
void setNClusters(uint32_t nClusters, int32_t offsetBPIX2) {
nClusters_h = nClusters;
offsetBPIX2_h = offsetBPIX2;
}

uint32_t nClusters() const { return nClusters_h; }
int32_t offsetBPIX2() const { return offsetBPIX2_h; }

uint32_t *moduleStart() { return moduleStart_d.get(); }
uint32_t *clusInModule() { return clusInModule_d.get(); }
Expand Down Expand Up @@ -58,6 +62,7 @@ class SiPixelClustersCUDA {
cms::cuda::device::unique_ptr<DeviceConstView> view_d; // "me" pointer

uint32_t nClusters_h = 0;
int32_t offsetBPIX2_h = 0;
};

#endif // CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
Expand Up @@ -16,6 +16,7 @@ class TrackingRecHit2DHeterogeneous {

explicit TrackingRecHit2DHeterogeneous(
uint32_t nHits,
int32_t offsetBPIX2,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
Expand All @@ -32,6 +33,7 @@ class TrackingRecHit2DHeterogeneous {
TrackingRecHit2DSOAView const* view() const { return m_view.get(); }

auto nHits() const { return m_nHits; }
auto offsetBPIX2() const { return m_offsetBPIX2; }

auto hitsModuleStart() const { return m_hitsModuleStart; }
auto hitsLayerStart() { return m_hitsLayerStart; }
Expand Down Expand Up @@ -60,6 +62,7 @@ class TrackingRecHit2DHeterogeneous {
unique_ptr<TrackingRecHit2DSOAView> m_view; //!

uint32_t m_nHits;
int32_t m_offsetBPIX2;

uint32_t const* m_hitsModuleStart; // needed for legacy, this is on GPU!

Expand All @@ -80,11 +83,12 @@ using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::Host
template <typename Traits>
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
uint32_t nHits,
int32_t offsetBPIX2,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input)
: m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
: m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);

view->m_nHits = nHits;
Expand Down
Expand Up @@ -18,11 +18,11 @@ int main() {
auto nHits = 200;
// inner scope to deallocate memory before destroying the stream
{
TrackingRecHit2DGPU tkhit(nHits, nullptr, nullptr, stream);
TrackingRecHit2DGPU tkhit(nHits, 0, nullptr, nullptr, stream);

testTrackingRecHit2D::runKernels(tkhit.view());

TrackingRecHit2DHost tkhitH(nHits, nullptr, nullptr, stream, &tkhit);
TrackingRecHit2DHost tkhitH(nHits, 0, nullptr, nullptr, stream, &tkhit);
cudaStreamSynchronize(stream);
assert(tkhitH.view());
assert(tkhitH.view()->nHits() == unsigned(nHits));
Expand Down
Expand Up @@ -452,7 +452,10 @@ namespace pixelgpudetails {

} // end of Raw to Digi kernel

__global__ void fillHitsModuleStart(uint32_t const *__restrict__ cluStart, uint32_t *__restrict__ moduleStart) {
__global__ void fillHitsModuleStart(uint32_t const *__restrict__ clusInModule,
uint32_t *__restrict__ moduleStart,
uint32_t const *__restrict__ nModules,
uint32_t *__restrict__ nModules_Clusters) {
assert(gpuClustering::maxNumModules < 2048); // easy to extend at least till 32*1024
assert(1 == gridDim.x);
assert(0 == blockIdx.x);
Expand All @@ -461,7 +464,7 @@ namespace pixelgpudetails {

// limit to maxHitsInModule()
for (int i = first, iend = gpuClustering::maxNumModules; i < iend; i += blockDim.x) {
moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), cluStart[i]);
moduleStart[i + 1] = std::min(gpuClustering::maxHitsInModule(), clusInModule[i]);
}

__shared__ uint32_t ws[32];
Expand All @@ -473,9 +476,18 @@ namespace pixelgpudetails {
}
__syncthreads();

if (threadIdx.x == 0) {
// copy the number of modules
nModules_Clusters[0] = *nModules;
// last element holds the number of all clusters
nModules_Clusters[1] = moduleStart[gpuClustering::maxNumModules];
// element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1)
nModules_Clusters[2] = moduleStart[96];
}

#ifdef GPU_DEBUG
assert(0 == moduleStart[0]);
auto c0 = std::min(gpuClustering::maxHitsInModule(), cluStart[0]);
auto c0 = std::min(gpuClustering::maxHitsInModule(), clusInModule[0]);
assert(c0 == moduleStart[1]);
assert(moduleStart[1024] >= moduleStart[1023]);
assert(moduleStart[1025] >= moduleStart[1024]);
Expand Down Expand Up @@ -519,8 +531,6 @@ namespace pixelgpudetails {
}
clusters_d = SiPixelClustersCUDA(gpuClustering::maxNumModules, stream);

nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(2, stream);

if (wordCounter) // protect in case of empty event....
{
const int threadsPerBlock = 512;
Expand Down Expand Up @@ -597,10 +607,6 @@ namespace pixelgpudetails {
digis_d.moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter);
cudaCheck(cudaGetLastError());

// read the number of modules into a data member, used by getProduct())
cudaCheck(cudaMemcpyAsync(
&(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream));

threadsPerBlock = 256;
blocks = maxNumModules;
#ifdef GPU_DEBUG
Expand Down Expand Up @@ -635,16 +641,15 @@ namespace pixelgpudetails {
// rechits) so that the number of clusters/hits can be made
// available in the rechit producer without additional points of
// synchronization/ExternalWork

auto nModules_Clusters_d = cms::cuda::make_device_unique<uint32_t[]>(3, stream);
// MUST be ONE block
fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d.clusInModule(), clusters_d.clusModuleStart());
fillHitsModuleStart<<<1, 1024, 0, stream>>>(
clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get());

// last element holds the number of all clusters
cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]),
clusters_d.clusModuleStart() + gpuClustering::maxNumModules,
sizeof(uint32_t),
cudaMemcpyDefault,
stream));
// copy to host
nModules_Clusters_h = cms::cuda::make_host_unique<uint32_t[]>(3, stream);
cudaCheck(cudaMemcpyAsync(
nModules_Clusters_h.get(), nModules_Clusters_d.get(), 3 * sizeof(uint32_t), cudaMemcpyDefault, stream));

#ifdef GPU_DEBUG
cudaDeviceSynchronize();
Expand Down
Expand Up @@ -114,7 +114,8 @@ namespace pixelgpudetails {

std::pair<SiPixelDigisCUDA, SiPixelClustersCUDA> getResults() {
digis_d.setNModulesDigis(nModules_Clusters_h[0], nDigis);
clusters_d.setNClusters(nModules_Clusters_h[1]);
assert(nModules_Clusters_h[2] <= nModules_Clusters_h[1]);
clusters_d.setNClusters(nModules_Clusters_h[1], nModules_Clusters_h[2]);
// need to explicitly deallocate while the associated CUDA
// stream is still alive
//
Expand Down
Expand Up @@ -38,7 +38,7 @@ namespace pixelgpudetails {
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
cudaStream_t stream) const {
auto nHits = clusters_d.nClusters();
TrackingRecHit2DGPU hits_d(nHits, cpeParams, clusters_d.clusModuleStart(), stream);
TrackingRecHit2DGPU hits_d(nHits, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream);

int threadsPerBlock = 128;
int blocks = digis_d.nModules(); // active modules (with digis)
Expand Down
Expand Up @@ -141,7 +141,9 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv
assert(numberOfClusters == int(hitsModuleStart[gpuClustering::maxNumModules]));

// output SoA
auto output = std::make_unique<TrackingRecHit2DCPU>(numberOfClusters, &cpeView, hitsModuleStart, nullptr);
// element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1)
auto output =
std::make_unique<TrackingRecHit2DCPU>(numberOfClusters, hitsModuleStart[96], &cpeView, hitsModuleStart, nullptr);

if (0 == numberOfClusters) {
iEvent.put(std::move(output));
Expand Down
9 changes: 8 additions & 1 deletion RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h
Expand Up @@ -73,11 +73,18 @@ namespace caConstants {
using CellNeighborsVector = cms::cuda::SimpleVector<CellNeighbors>;
using CellTracksVector = cms::cuda::SimpleVector<CellTracks>;

using OuterHitOfCell = cms::cuda::VecArray<uint32_t, maxCellsPerHit>;
using OuterHitOfCellContainer = cms::cuda::VecArray<uint32_t, maxCellsPerHit>;
using TuplesContainer = cms::cuda::OneToManyAssoc<hindex_type, maxTuples, 5 * maxTuples>;
using HitToTuple = cms::cuda::OneToManyAssoc<tindex_type, -1, 4 * maxTuples>; // 3.5 should be enough
using TupleMultiplicity = cms::cuda::OneToManyAssoc<tindex_type, 8, maxTuples>;

struct OuterHitOfCell {
OuterHitOfCellContainer* container;
int32_t offset;
constexpr auto& operator[](int i) { return container[i - offset]; }
constexpr auto const& operator[](int i) const { return container[i - offset]; }
};

} // namespace caConstants

#endif // RecoPixelVertexing_PixelTriplets_plugins_CAConstants_h
Expand Up @@ -15,15 +15,16 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
auto nhits = hh.nHits();

#ifdef NTUPLE_DEBUG
std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
std::cout << "building Doublets out of " << nhits << " Hits. BPIX2 offset is " << hh.offsetBPIX2() << std::endl;
#endif

// use "nhits" to heuristically dimension the workspace

// no need to use the Traits allocations, since we know this is being compiled for the CPU
//device_isOuterHitOfCell_ = Traits::template make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits), stream);
device_isOuterHitOfCell_ = std::make_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U, nhits));
device_isOuterHitOfCell_ = std::make_unique<GPUCACell::OuterHitOfCellContainer[]>(std::max(1U, nhits));
assert(device_isOuterHitOfCell_.get());
isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};

auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks);
Expand All @@ -34,7 +35,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets *
sizeof(GPUCACell::CellNeighbors));

gpuPixelDoublets::initDoublets(device_isOuterHitOfCell_.get(),
gpuPixelDoublets::initDoublets(isOuterHitOfCell_,
nhits,
device_theCellNeighbors_.get(),
device_theCellNeighborsContainer_,
Expand Down Expand Up @@ -64,7 +65,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
hh.view(),
device_isOuterHitOfCell_.get(),
isOuterHitOfCell_,
nActualPairs,
params_.idealConditions_,
params_.doClusterCut_,
Expand Down Expand Up @@ -98,7 +99,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_.get(),
device_isOuterHitOfCell_.get(),
isOuterHitOfCell_,
params_.hardCurvCut_,
params_.ptmin_,
params_.CAThetaCutBarrel_,
Expand All @@ -107,8 +108,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
params_.dcaCutOuterTriplet_);

if (nhits > 1 && params_.earlyFishbone_) {
gpuPixelDoublets::fishbone(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false);
}

kernel_find_ntuplets(hh.view(),
Expand All @@ -132,8 +132,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
kernel_fillMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get());

if (nhits > 1 && params_.lateFishbone_) {
gpuPixelDoublets::fishbone(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
gpuPixelDoublets::fishbone(hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
}

if (params_.doStats_) {
Expand All @@ -145,7 +144,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *
device_nCells_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
device_isOuterHitOfCell_.get(),
isOuterHitOfCell_,
nhits,
params_.maxNumberOfDoublets_,
counters_);
Expand Down
Expand Up @@ -54,7 +54,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
device_theCells_.get(),
device_nCells_,
device_theCellNeighbors_.get(),
device_isOuterHitOfCell_.get(),
isOuterHitOfCell_,
params_.hardCurvCut_,
params_.ptmin_,
params_.CAThetaCutBarrel_,
Expand All @@ -67,11 +67,11 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
auto nthTot = 128;
auto stride = 16;
auto blockSize = nthTot / stride;
auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
auto numberOfBlocks = (nhits - isOuterHitOfCell_.offset + blockSize - 1) / blockSize;
dim3 blks(1, numberOfBlocks, 1);
dim3 thrs(stride, blockSize, 1);
gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, false);
cudaCheck(cudaGetLastError());
}

Expand Down Expand Up @@ -119,11 +119,11 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *
auto nthTot = 128;
auto stride = 16;
auto blockSize = nthTot / stride;
auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
auto numberOfBlocks = (nhits - isOuterHitOfCell_.offset + blockSize - 1) / blockSize;
dim3 blks(1, numberOfBlocks, 1);
dim3 thrs(stride, blockSize, 1);
gpuPixelDoublets::fishbone<<<blks, thrs, 0, cudaStream>>>(
hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
hh.view(), device_theCells_.get(), device_nCells_, isOuterHitOfCell_, nhits, true);
cudaCheck(cudaGetLastError());
}

Expand All @@ -140,6 +140,8 @@ template <>
void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) {
int32_t nhits = hh.nHits();

isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};

#ifdef NTUPLE_DEBUG
std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
#endif
Expand All @@ -150,9 +152,12 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
#endif

// in principle we can use "nhits" to heuristically dimension the workspace...
device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1, nhits), stream);
device_isOuterHitOfCell_ = cms::cuda::make_device_unique<GPUCACell::OuterHitOfCellContainer[]>(
std::max(1, nhits - hh.offsetBPIX2()), stream);
assert(device_isOuterHitOfCell_.get());

isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()};

cellStorage_ = cms::cuda::make_device_unique<unsigned char[]>(
caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) +
caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks),
Expand All @@ -164,8 +169,8 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
{
int threadsPerBlock = 128;
// at least one block!
int blocks = (std::max(1, nhits) + threadsPerBlock - 1) / threadsPerBlock;
gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(device_isOuterHitOfCell_.get(),
int blocks = (std::max(1, nhits - hh.offsetBPIX2()) + threadsPerBlock - 1) / threadsPerBlock;
gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream>>>(isOuterHitOfCell_,
nhits,
device_theCellNeighbors_.get(),
device_theCellNeighborsContainer_,
Expand Down Expand Up @@ -206,7 +211,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
hh.view(),
device_isOuterHitOfCell_.get(),
isOuterHitOfCell_,
nActualPairs,
params_.idealConditions_,
params_.doClusterCut_,
Expand Down Expand Up @@ -324,7 +329,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA
device_nCells_,
device_theCellNeighbors_.get(),
device_theCellTracks_.get(),
device_isOuterHitOfCell_.get(),
isOuterHitOfCell_,
nhits,
params_.maxNumberOfDoublets_,
counters_);
Expand Down
Expand Up @@ -205,7 +205,8 @@ class CAHitNtupletGeneratorKernels {
caConstants::CellTracks* device_theCellTracksContainer_;

unique_ptr<GPUCACell[]> device_theCells_;
unique_ptr<GPUCACell::OuterHitOfCell[]> device_isOuterHitOfCell_;
unique_ptr<GPUCACell::OuterHitOfCellContainer[]> device_isOuterHitOfCell_;
GPUCACell::OuterHitOfCell isOuterHitOfCell_;
uint32_t* device_nCells_ = nullptr;

unique_ptr<HitToTuple> device_hitToTuple_;
Expand Down

0 comments on commit 7e79ae6

Please sign in to comment.