Merge pull request #17 from hatakeyamak/PFRecHitAndCluster_GPU_12_5_dev
Various updates
hatakeyamak committed Oct 11, 2022
2 parents 9d63cfe + 909ce13 commit 989035a
Showing 8 changed files with 213 additions and 168 deletions.
83 changes: 15 additions & 68 deletions RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h
@@ -82,6 +82,8 @@ namespace PFRecHit {
uint32_t nValidBarrelIds;
uint32_t nValidEndcapIds;
uint32_t nValidDetIds;
uint32_t nDenseIdsInRange;
uint32_t denseIdHcalMin;
float qTestThresh;
uint32_t qTestDepthHB[4];
uint32_t qTestDepthHE[7];
@@ -166,34 +168,6 @@ namespace PFClustering {
uint32_t maxNeighbors = 8;
};

struct InputDataCPU {
cms::cuda::host::unique_ptr<float[]> pfrh_x;
cms::cuda::host::unique_ptr<float[]> pfrh_y;
cms::cuda::host::unique_ptr<float[]> pfrh_z;
cms::cuda::host::unique_ptr<float[]> pfrh_energy;

cms::cuda::host::unique_ptr<int[]> pfrh_layer;
cms::cuda::host::unique_ptr<int[]> pfrh_depth;
cms::cuda::host::unique_ptr<int[]> pfNeighFourInd;
cms::cuda::host::unique_ptr<int[]> pfNeighEightInd;
cms::cuda::host::unique_ptr<int[]> pfrh_edgeId;
cms::cuda::host::unique_ptr<int[]> pfrh_edgeList;

void allocate(ConfigurationParameters const& config, cudaStream_t cudaStream = cudaStreamDefault) {
pfrh_x = cms::cuda::make_host_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);
pfrh_y = cms::cuda::make_host_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);
pfrh_z = cms::cuda::make_host_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);
pfrh_energy = cms::cuda::make_host_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);
pfrh_layer = cms::cuda::make_host_unique<int[]>(sizeof(int) * config.maxRH, cudaStream);
pfrh_depth = cms::cuda::make_host_unique<int[]>(sizeof(int) * config.maxRH, cudaStream);
pfNeighFourInd = cms::cuda::make_host_unique<int[]>(sizeof(int) * config.maxRH * 4, cudaStream);
pfNeighEightInd = cms::cuda::make_host_unique<int[]>(sizeof(int) * config.maxRH * 8, cudaStream);
pfrh_edgeId = cms::cuda::make_host_unique<int[]>(sizeof(int) * config.maxRH * config.maxNeighbors, cudaStream);
pfrh_edgeList =
cms::cuda::make_host_unique<int[]>(sizeof(int) * config.maxRH * config.maxNeighbors, cudaStream);
}
};

struct OutputDataCPU {
cms::cuda::host::unique_ptr<int[]> pfrh_topoId;
cms::cuda::host::unique_ptr<int[]> pfrh_isSeed;
@@ -234,46 +208,6 @@ namespace PFClustering {
}
};

struct InputDataGPU {
cms::cuda::device::unique_ptr<float[]> pfrh_x;
cms::cuda::device::unique_ptr<float[]> pfrh_y;
cms::cuda::device::unique_ptr<float[]> pfrh_z;
cms::cuda::device::unique_ptr<float[]> pfrh_energy;
cms::cuda::device::unique_ptr<float[]> pcrh_fracSum;

cms::cuda::device::unique_ptr<int[]> pfrh_layer;
cms::cuda::device::unique_ptr<int[]> pfrh_depth;
cms::cuda::device::unique_ptr<int[]> pfNeighFourInd;
cms::cuda::device::unique_ptr<int[]> pfNeighEightInd;
cms::cuda::device::unique_ptr<int[]> pfrh_edgeId;
cms::cuda::device::unique_ptr<int[]> pfrh_edgeList;

cms::cuda::device::unique_ptr<float4[]> pfc_pos4;
cms::cuda::device::unique_ptr<float4[]> pfc_prevPos4;
cms::cuda::device::unique_ptr<float[]> pfc_energy;

void allocate(ConfigurationParameters const& config, cudaStream_t cudaStream = cudaStreamDefault) {
pfrh_x = cms::cuda::make_device_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);
pfrh_y = cms::cuda::make_device_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);
pfrh_z = cms::cuda::make_device_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);
pfrh_energy = cms::cuda::make_device_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);
pcrh_fracSum = cms::cuda::make_device_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);

pfrh_layer = cms::cuda::make_device_unique<int[]>(sizeof(int) * config.maxRH, cudaStream);
pfrh_depth = cms::cuda::make_device_unique<int[]>(sizeof(int) * config.maxRH, cudaStream);
pfNeighFourInd = cms::cuda::make_device_unique<int[]>(sizeof(int) * config.maxRH * 4, cudaStream);
pfNeighEightInd = cms::cuda::make_device_unique<int[]>(sizeof(int) * config.maxRH * 8, cudaStream);
pfrh_edgeId =
cms::cuda::make_device_unique<int[]>(sizeof(int) * config.maxRH * config.maxNeighbors, cudaStream);
pfrh_edgeList =
cms::cuda::make_device_unique<int[]>(sizeof(int) * config.maxRH * config.maxNeighbors, cudaStream);

pfc_pos4 = cms::cuda::make_device_unique<float4[]>(sizeof(float4) * config.maxRH, cudaStream);
pfc_prevPos4 = cms::cuda::make_device_unique<float4[]>(sizeof(float4) * config.maxRH, cudaStream);
pfc_energy = cms::cuda::make_device_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);
}
};

struct OutputDataGPU {
cms::cuda::device::unique_ptr<int[]> pfrh_topoId;
cms::cuda::device::unique_ptr<int[]> pfrh_isSeed;
@@ -294,6 +228,9 @@ namespace PFClustering {
cms::cuda::device::unique_ptr<int[]> pcrhFracSize; // Total number of pfc fractions to copy back
cms::cuda::device::unique_ptr<int[]> nEdges; // Sum total number of rechit neighbours

cms::cuda::device::unique_ptr<float4[]> pfc_pos4;
cms::cuda::device::unique_ptr<float[]> pfc_energy;

void allocate(ConfigurationParameters const& config, cudaStream_t cudaStream = cudaStreamDefault) {
pfrh_topoId = cms::cuda::make_device_unique<int[]>(sizeof(int) * config.maxRH, cudaStream);
pfrh_isSeed = cms::cuda::make_device_unique<int[]>(sizeof(int) * config.maxRH, cudaStream);
@@ -312,6 +249,9 @@ namespace PFClustering {
topoIter = cms::cuda::make_device_unique<int[]>(sizeof(int), cudaStream);
pcrhFracSize = cms::cuda::make_device_unique<int[]>(sizeof(int), cudaStream);
nEdges = cms::cuda::make_device_unique<int[]>(sizeof(int), cudaStream);

pfc_pos4 = cms::cuda::make_device_unique<float4[]>(sizeof(float4) * config.maxRH, cudaStream);
pfc_energy = cms::cuda::make_device_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);
}
};

@@ -321,6 +261,9 @@ namespace PFClustering {
cms::cuda::device::unique_ptr<int[]> pfrh_edgeList;
cms::cuda::device::unique_ptr<int[]> pfrh_edgeMask;

cms::cuda::device::unique_ptr<float[]> pcrh_fracSum;
cms::cuda::device::unique_ptr<float4[]> pfc_prevPos4;

void allocate(ConfigurationParameters const& config, cudaStream_t cudaStream = cudaStreamDefault) {
rhcount = cms::cuda::make_device_unique<int[]>(sizeof(int) * config.maxRH, cudaStream);
pfrh_edgeId =
@@ -329,6 +272,10 @@ namespace PFClustering {
cms::cuda::make_device_unique<int[]>(sizeof(int) * config.maxRH * config.maxNeighbors, cudaStream);
pfrh_edgeMask =
cms::cuda::make_device_unique<int[]>(sizeof(int) * config.maxRH * config.maxNeighbors, cudaStream);

pcrh_fracSum = cms::cuda::make_device_unique<float[]>(sizeof(float) * config.maxRH, cudaStream);
pfc_prevPos4 = cms::cuda::make_device_unique<float4[]>(sizeof(float4) * config.maxRH, cudaStream);

}
};
} // namespace HCAL
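The hunks above retire the separate InputDataCPU/InputDataGPU holders: the persistent cluster outputs (pfc_pos4, pfc_energy) move into OutputDataGPU, while purely intermediate buffers (pcrh_fracSum, pfc_prevPos4) move into ScratchDataGPU, each struct owning its device memory through a stream-aware allocate(). Below is a minimal sketch of that buffer-struct pattern written against the plain CUDA runtime; the struct name and the cudaMallocAsync/cudaFreeAsync calls are illustrative stand-ins for the cms::cuda::make_device_unique caching-allocator helpers used in the actual code, and stream-ordered allocation assumes CUDA 11.2 or newer.

#include <cuda_runtime.h>

// Sketch only: scratch buffers grouped in one struct with stream-aware allocation.
struct ScratchDataSketch {
  float* pcrh_fracSum = nullptr;   // per-rechit fraction sums (intermediate)
  float4* pfc_prevPos4 = nullptr;  // cluster positions from the previous iteration

  void allocate(int maxRH, cudaStream_t stream) {
    // Stream-ordered device allocations, sized by the maximum number of rechits.
    cudaMallocAsync(reinterpret_cast<void**>(&pcrh_fracSum), sizeof(float) * maxRH, stream);
    cudaMallocAsync(reinterpret_cast<void**>(&pfc_prevPos4), sizeof(float4) * maxRH, stream);
  }

  void release(cudaStream_t stream) {
    // Free on the same stream so deallocation stays ordered after the kernels that used the buffers.
    cudaFreeAsync(pcrh_fracSum, stream);
    cudaFreeAsync(pfc_prevPos4, stream);
  }
};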
20 changes: 7 additions & 13 deletions RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
@@ -1303,12 +1303,7 @@ namespace PFClusterCudaHCAL {
gridStride = blockDim.x;
iter = 0;
notDone = true;
//debug = true;
debug = false;
//int s0 = topoSeedList[topoSeedBegin];
//debug = (s0==33 || s0==49 || s0==50 || s0==57 || s0==58 || s0==59 || s0==36 || s0==37 || s0==39 || s0==46 || s0==55 || s0==61 || s0==63 || s0==99 || s0==105 || s0==112 || s0==114 || s0==122 || s0==109 || s0==116 || s0==66 || s0==67 || s0==72 || s0==74 || s0==75 || s0==81 || s0==70 || s0==76 || s0==78 || s0==353 || s0==360 || s0==362 || s0==368 || s0==377 || s0==378 || s0==366 || s0==373 || s0==375 || s0==382 || s0==2 || s0==11 || s0==16 || s0==26 || s0==12 || s0==14 || s0==22 || s0==29 || s0==31 || s0==323 || s0==328 || s0==331 || s0==344 || s0==345 || s0==325 || s0==326 || s0==349 || s0==163 || s0==177 || s0==165 || s0==175 || s0==182 || s0==190 || s0==130 || s0==138 || s0==146 || s0==153 || s0==155 || s0==135 || s0==141 || s0==149 || s0==151 || s0==157 || s0==159 || s0==234 || s0==248 || s0==250 || s0==254 || s0==418 || s0==424 || s0==421 || s0==422 || s0==429 || s0==431 || s0==436 || s0==438 || s0==444 || s0==446 || s0==202 || s0==209 || s0==196 || s0==207 || s0==384 || s0==386 || s0==394 || s0==402 || s0==411 || s0==388 || s0==391 || s0==397 || s0==405 || s0==412 || s0==414 || s0==305 || s0==309 || s0==480 || s0==483 || s0==488 || s0==498 || s0==506 || s0==486 || s0==494 || s0==500 || s0==509 || s0==259 || s0==264 || s0==267 || s0==275 || s0==280 || s0==282 || s0==269 || s0==271 || s0==278 || s0==284 || s0==450 || s0==458 || s0==466 || s0==472 || s0==475 || s0==469 || s0==479 || s0==548 || s0==550 || s0==565 || s0==608 || s0==617 || s0==613 || s0==615 || s0==620 || s0==623 || s0==639 || s0==592 || s0==595 || s0==589 || s0==597 || s0==599 || s0==604 || s0==866 || s0==872 || s0==874 || s0==881 || s0==886 || s0==892 || s0==841 || s0==858 || s0==846 || s0==855 || s0==860 || s0==701 || s0==702 || s0==643 || s0==646 || s0==654 || s0==662 || s0==739 || s0==746 || s0==752 || s0==755 || s0==740 || s0==743 || s0==759 || s0==767 || s0==531 || s0==525 || s0==527 || s0==543 || s0==929 || s0==945 || s0==934 || s0==956 || s0==959 || s0==704 || s0==705 || s0==707 || s0==714 || s0==720 || s0==723 || s0==728 || s0==729 || s0==730 || s0==731 || s0==708 || s0==710 || s0==718 || s0==725 || s0==732 || s0==735 || s0==993 || s0==995 || s0==1008 || s0==1010 || s0==1014 || s0==1015 || s0==1020 || s0==896 || s0==898 || s0==920 || s0==922 || s0==903 || s0==910 || s0==918 || s0==800 || s0==809 || s0==804 || s0==805 || s0==822 || s0==777 || s0==779 || s0==786 || s0==792 || s0==794 || s0==772 || s0==780 || s0==781 || s0==798 || s0==960 || s0==978 || s0==984 || s0==964 || s0==966 || s0==973 || s0==975 || s0==980 || s0==981 || s0==1059 || s0==1072 || s0==1081 || s0==1070 || s0==1076 || s0==1079 || s0==1086 || s0==1147 || s0==1124 || s0==1135 || s0==1141 || s0==1149 || s0==1090 || s0==1098 || s0==1114 || s0==1095 || s0==1109 || s0==1119 || s0==1386 || s0==1387 || s0==1401 || s0==1396 || s0==1404 || s0==1347 || s0==1352 || s0==1355 || s0==1360 || s0==1185 || s0==1186 || s0==1194 || s0==1203 || s0==1210 || s0==1189 || s0==1207 || s0==1034 || s0==1040 || s0==1042 || s0==1049 || s0==1031 || s0==1044 || s0==1047 || s0==1154 || s0==1161 || s0==1177 || s0==1157 || s0==1164 || s0==1166 || s0==1172 || s0==1180 || s0==1249 || s0==1250 || s0==1251 || s0==1258 || s0==1266 || s0==1255 || s0==1260 || s0==1262 || s0==1270 || s0==1276 || s0==1409 || s0==1411 || s0==1418 || s0==1426 || s0==1427 || s0==1435 || s0==1415 || s0==1420 || s0==1422 || s0==1430 || s0==1441 || s0==1442 || s0==1443 || s0==1449 || s0==1458 || s0==1464 || s0==1467 || s0==1447 || s0==1461 || s0==1504 || s0==1515 || s0==1520 || s0==1529 || s0==1509 || s0==1511 || 
s0==1517 || s0==1524 || s0==1527 || s0==1532 || s0==1534 || s0==1320 || s0==1323 || s0==1331 || s0==1338 || s0==1317 || s0==1326 || s0==1335 || s0==1343 || s0==1218 || s0==1233 || s0==1234 || s0==1242 || s0==1243 || s0==1228 || s0==1230 || s0==1237 || s0==1239 || s0==1244 || s0==1481 || s0==1479 || s0==1492 || s0==1283 || s0==1289 || s0==1290 || s0==1304 || s0==1307 || s0==1293 || s0==1536 || s0==1539 || s0==1546 || s0==1553 || s0==1541 || s0==1557 || s0==1558 || s0==2114) ? true : false;

//debug = (nSeeds == 2 && ( (topoSeedList[topoSeedBegin]==11 && topoSeedList[topoSeedBegin+1]==5) || (topoSeedList[topoSeedBegin]==5 && topoSeedList[topoSeedBegin+1]==11) )) ? true : false;

int i = topoSeedList[topoSeedBegin];
if (pfrh_layer[i] == PFLayer::HCAL_BARREL1)
@@ -4537,16 +4532,15 @@ namespace PFClusterCudaHCAL {
cudaStream_t cudaStream,
int nEdges,
::hcal::PFRecHitCollection<::pf::common::DevStoragePolicy> const& inputPFRecHits,
::PFClustering::HCAL::InputDataGPU& inputGPU,
::PFClustering::HCAL::OutputDataCPU& outputCPU,
::PFClustering::HCAL::OutputDataGPU& outputGPU,
::PFClustering::HCAL::ScratchDataGPU& scratchGPU,
float (&timer)[8]) {

int nRH = inputPFRecHits.size;
const int threadsPerBlock = 256;
const int nRH = inputPFRecHits.size;

// Combined seeding & topo clustering thresholds, array initialization
seedingTopoThreshKernel_HCAL<<<(nRH + 31) / 32, 64, 0, cudaStream>>>(nRH,
seedingTopoThreshKernel_HCAL<<<(nRH + threadsPerBlock -1) / threadsPerBlock, threadsPerBlock, 0, cudaStream>>>(nRH,
inputPFRecHits.pfrh_energy.get(),
inputPFRecHits.pfrh_x.get(),
inputPFRecHits.pfrh_y.get(),
@@ -4624,16 +4618,16 @@ namespace PFClusterCudaHCAL {
inputPFRecHits.pfrh_neighbours.get(),
outputGPU.pcrh_frac.get(),
outputGPU.pcrh_fracInd.get(),
inputGPU.pcrh_fracSum.get(),
scratchGPU.pcrh_fracSum.get(),
scratchGPU.rhcount.get(),
outputGPU.topoSeedCount.get(),
outputGPU.topoRHCount.get(),
outputGPU.seedFracOffsets.get(),
outputGPU.topoSeedOffsets.get(),
outputGPU.topoSeedList.get(),
inputGPU.pfc_pos4.get(),
inputGPU.pfc_prevPos4.get(),
inputGPU.pfc_energy.get(),
outputGPU.pfc_pos4.get(),
scratchGPU.pfc_prevPos4.get(),
outputGPU.pfc_energy.get(),
outputGPU.pfc_iter.get());
}
} // namespace PFClusterCudaHCAL
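In the updated entry point the hard-coded <<<(nRH + 31) / 32, 64>>> launch is replaced by a grid size computed from a named threadsPerBlock with ceiling division, so the block count always covers all nRH rechits. A short sketch of that launch idiom follows; the kernel name and its body are hypothetical, and only the grid-size arithmetic mirrors the entry point.

#include <cuda_runtime.h>

// Hypothetical kernel: one thread per rechit, with a bounds guard.
__global__ void resetFlagsKernel(int n, int* flags) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)  // the last block may be only partially filled
    flags[i] = 0;
}

void launchResetFlags(int nRH, int* d_flags, cudaStream_t stream) {
  const int threadsPerBlock = 256;
  // Round up so every rechit gets a thread even when nRH is not a multiple of threadsPerBlock.
  const int blocks = (nRH + threadsPerBlock - 1) / threadsPerBlock;
  if (nRH > 0)
    resetFlagsKernel<<<blocks, threadsPerBlock, 0, stream>>>(nRH, d_flags);
}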
@@ -54,8 +54,6 @@ namespace PFClusterCudaHCAL {
cudaStream_t cudaStream,
int nEdges,
::hcal::PFRecHitCollection<::pf::common::DevStoragePolicy> const& inputPFRecHits,
::PFClustering::HCAL::InputDataGPU& inputGPU,
::PFClustering::HCAL::OutputDataCPU& outputCPU,
::PFClustering::HCAL::OutputDataGPU& outputGPU,
::PFClustering::HCAL::ScratchDataGPU& scratchGPU,
float (&timer)[8]);
@@ -74,9 +74,6 @@ class PFClusterProducerCudaHCAL : public edm::stream::EDProducer<edm::ExternalWork>
PFClustering::HCAL::ConfigurationParameters cudaConfig_;
PFClustering::common::CudaHCALConstants cudaConstants;

PFClustering::HCAL::InputDataCPU inputCPU;
PFClustering::HCAL::InputDataGPU inputGPU;

PFClustering::HCAL::OutputDataCPU outputCPU;
PFClustering::HCAL::OutputDataGPU outputGPU;

@@ -268,13 +265,6 @@ void PFClusterProducerCudaHCAL::fillDescriptions(edm::ConfigurationDescriptions&
}

void PFClusterProducerCudaHCAL::beginLuminosityBlock(const edm::LuminosityBlock& lumi, const edm::EventSetup& es) {
/* KenH
_initialClustering->update(es);
if (_pfClusterBuilder)
_pfClusterBuilder->update(es);
if (_positionReCalc)
_positionReCalc->update(es);
*/
initCuda_ = true; // (Re)initialize cuda arrays
}
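With the commented-out update() calls gone, beginLuminosityBlock() only raises initCuda_, and the device and host buffers are (re)allocated lazily on the first acquire() of the luminosity block. A compact sketch of that flag-guarded, allocate-once pattern follows; the class and member names are illustrative, not the producer's actual interface.

#include <cuda_runtime.h>

// Sketch only: lazy, flag-guarded allocation of GPU buffers.
class ProducerSketch {
public:
  void beginLuminosityBlock() { initCuda_ = true; }  // request (re)initialization

  void acquire(cudaStream_t stream) {
    if (initCuda_) {
      allocateDeviceBuffers(stream);  // done once per luminosity block
      initCuda_ = false;
    }
    // ... enqueue kernels and asynchronous copies on `stream` ...
  }

private:
  void allocateDeviceBuffers(cudaStream_t /*stream*/) { /* e.g. outputGPU.allocate(...), scratchGPU.allocate(...) */ }
  bool initCuda_ = true;
};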

@@ -295,26 +285,13 @@ void PFClusterProducerCudaHCAL::acquire(edm::Event const& event,
// Only allocate Cuda memory on first event
PFClusterCudaHCAL::initializeCudaConstants(cudaConstants, cudaStream);

inputCPU.allocate(cudaConfig_, cudaStream);
inputGPU.allocate(cudaConfig_, cudaStream);

outputCPU.allocate(cudaConfig_, cudaStream);
outputGPU.allocate(cudaConfig_, cudaStream);
scratchGPU.allocate(cudaConfig_, cudaStream);

initCuda_ = false;
}

/* KenH
_initialClustering->reset();
if (_pfClusterBuilder)
_pfClusterBuilder->reset();
*/

/* KenH
_initialClustering->updateEvent(event);
*/

nRH_ = PFRecHits.size;
if (nRH_ == 0) return;
if (nRH_>4000) std::cout << "nRH(PFRecHitSize)>4000: " << nRH_ << std::endl;
@@ -328,28 +305,23 @@ void PFClusterProducerCudaHCAL::acquire(edm::Event const& event,
cudaCheck(cudaStreamSynchronize(cudaStream));

// Calling cuda kernels
PFClusterCudaHCAL::PFRechitToPFCluster_HCAL_entryPoint(cudaStream, totalNeighbours, PFRecHits, inputGPU, outputCPU, outputGPU, scratchGPU, kernelTimers);
PFClusterCudaHCAL::PFRechitToPFCluster_HCAL_entryPoint(cudaStream, totalNeighbours, PFRecHits, outputGPU, scratchGPU, kernelTimers);

if (!_produceLegacy) return; // do device->host transfer only when we are producing Legacy data

// Data transfer from GPU
if (cudaStreamQuery(cudaStream) != cudaSuccess)
cudaCheck(cudaStreamSynchronize(cudaStream));

cudaCheck(cudaMemcpyAsync(
outputCPU.topoIter.get(), outputGPU.topoIter.get(), sizeof(int), cudaMemcpyDeviceToHost, cudaStream));
cudaCheck(cudaMemcpyAsync(
outputCPU.pcrhFracSize.get(), outputGPU.pcrhFracSize.get(), sizeof(int), cudaMemcpyDeviceToHost, cudaStream));
cudaCheck(
cudaMemcpyAsync(outputCPU.nEdges.get(), outputGPU.nEdges.get(), sizeof(int), cudaMemcpyDeviceToHost, cudaStream));

if (cudaStreamQuery(cudaStream) != cudaSuccess)
cudaCheck(cudaStreamSynchronize(cudaStream));

// Total size of allocated rechit fraction arrays (includes some extra padding for rechits that don't end up passing cuts)
const Int_t nFracs = outputCPU.pcrhFracSize[0];

cudaCheck(cudaMemcpyAsync(
outputCPU.pfc_iter.get(), outputGPU.pfc_iter.get(), numbytes_int, cudaMemcpyDeviceToHost, cudaStream));

cudaCheck(cudaMemcpyAsync(
outputCPU.topoSeedCount.get(), outputGPU.topoSeedCount.get(), numbytes_int, cudaMemcpyDeviceToHost, cudaStream));

cudaMemcpyDeviceToHost,
cudaStream));

cudaCheck(cudaMemcpyAsync(outputCPU.topoSeedOffsets.get(),
outputGPU.topoSeedOffsets.get(),
numbytes_int,
cudaMemcpyDeviceToHost,
cudaStream));

cudaCheck(cudaMemcpyAsync(
outputCPU.topoSeedList.get(), outputGPU.topoSeedList.get(), numbytes_int, cudaMemcpyDeviceToHost, cudaStream));

cudaCheck(cudaMemcpyAsync(outputCPU.pcrh_fracInd.get(),
outputGPU.pcrh_fracInd.get(),
sizeof(int) * nFracs,
Expand All @@ -384,12 +347,6 @@ void PFClusterProducerCudaHCAL::acquire(edm::Event const& event,
cudaCheck(cudaMemcpyAsync(
outputCPU.pfrh_topoId.get(), outputGPU.pfrh_topoId.get(), numbytes_int, cudaMemcpyDeviceToHost, cudaStream));

cudaCheck(cudaMemcpyAsync(outputCPU.pfrh_passTopoThresh.get(),
outputGPU.pfrh_passTopoThresh.get(),
sizeof(int) * nRH_,
cudaMemcpyDeviceToHost,
cudaStream));

if (cudaStreamQuery(cudaStream) != cudaSuccess)
cudaCheck(cudaStreamSynchronize(cudaStream));
}
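The transfer block above queues cudaMemcpyAsync copies from the device into pinned host buffers on the producer's stream and synchronizes only when the host actually needs the values (for example pcrhFracSize, which sizes the later fraction transfers). A stripped-down sketch of that device-to-host pattern with the plain CUDA API; the function and buffer names are illustrative, and the real code keeps its pinned buffers alive in OutputDataCPU rather than allocating them per call.

#include <cuda_runtime.h>

// Sketch only: async device-to-host copies into pinned memory, then one sync before use.
void copyScalarsToHost(const int* d_pcrhFracSize, const int* d_nEdges, cudaStream_t stream) {
  int* h_pcrhFracSize = nullptr;
  int* h_nEdges = nullptr;
  cudaMallocHost(reinterpret_cast<void**>(&h_pcrhFracSize), sizeof(int));  // pinned, so the copy can overlap
  cudaMallocHost(reinterpret_cast<void**>(&h_nEdges), sizeof(int));

  cudaMemcpyAsync(h_pcrhFracSize, d_pcrhFracSize, sizeof(int), cudaMemcpyDeviceToHost, stream);
  cudaMemcpyAsync(h_nEdges, d_nEdges, sizeof(int), cudaMemcpyDeviceToHost, stream);

  // Block until the copies (and any kernels queued before them) have finished.
  cudaStreamSynchronize(stream);

  // ... read *h_pcrhFracSize and *h_nEdges on the host ...

  cudaFreeHost(h_pcrhFracSize);
  cudaFreeHost(h_nEdges);
}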
@@ -448,6 +405,7 @@ void PFClusterProducerCudaHCAL::produce(edm::Event& event, const edm::EventSetup

event.put(std::move(pfClustersFromCuda));
}

}

DEFINE_FWK_MODULE(PFClusterProducerCudaHCAL);
