Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "ITStracking/BoundedAllocator.h"
#include "ITStracking/TimeFrame.h"
#include "ITStracking/Configuration.h"
#include "ITStracking/TrackExtensionCandidate.h"
#include "ITStrackingGPU/Utils.h"

namespace o2::its::gpu
Expand Down Expand Up @@ -90,8 +91,13 @@ class TimeFrameGPU : public TimeFrame<NLayers>
void createNeighboursDevice(const unsigned int layer);
void createNeighboursLUTDevice(const int, const unsigned int);
void createTrackITSExtDevice(const size_t);
void loadTrackExtensionStartTracksDevice();
void createTrackExtensionCandidatesDevice(const size_t);
void createTrackExtensionScratchDevice(const int nThreads, const int beamWidth);
void createTrackExtensionResultsDevice(const size_t);
void downloadTrackITSExtDevice();
void downloadCellsNeighboursDevice(std::vector<bounded_vector<CellNeighbour>>&, const int);
void downloadTrackExtensionResultsDevice();
void downloadNeighboursLUTDevice(bounded_vector<int>&, const int);
void downloadCellsDevice();
void downloadCellsLUTDevice();
Expand All @@ -118,13 +124,20 @@ class TimeFrameGPU : public TimeFrame<NLayers>
const auto getDeviceTrackingTopologyView() const { return mDeviceTrackingTopologyView; }
int* getDeviceROFramesClusters(const int layer) { return mROFramesClustersDevice[layer]; }
auto& getTrackITSExt() { return mTrackITSExt; }
auto& getTrackExtensionResults() { return mTrackExtensionResults; }
Vertex* getDeviceVertices() { return mPrimaryVerticesDevice; }
int* getDeviceROFramesPV() { return mROFramesPVDevice; }
unsigned char* getDeviceUsedClusters(const int);
const o2::base::Propagator* getChainPropagator();

// Hybrid
TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; }
TrackITSExt* getDeviceTrackExtensionStartTracks() { return mTrackExtensionStartTracksDevice; }
TrackExtensionCandidate<NLayers>* getDeviceTrackExtensionCandidates() { return mTrackExtensionCandidatesDevice; }
int* getDeviceTrackExtensionCandidateOffsets() { return mTrackExtensionCandidateOffsetsDevice; }
TrackExtensionHypothesis<NLayers>* getDeviceActiveTrackExtensionHypotheses() { return mActiveTrackExtensionHypothesesDevice; }
TrackExtensionHypothesis<NLayers>* getDeviceNextTrackExtensionHypotheses() { return mNextTrackExtensionHypothesesDevice; }
TrackExtensionResult<NLayers>* getDeviceTrackExtensionResults() { return mTrackExtensionResultsDevice; }
int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; }
gsl::span<int*> getDeviceNeighboursLUTs() { return mNeighboursLUTDevice; }
CellNeighbour** getDeviceArrayNeighbours() { return mNeighboursDeviceArray; }
Expand Down Expand Up @@ -222,6 +235,13 @@ class TimeFrameGPU : public TimeFrame<NLayers>
float** mCellSeedsChi2DeviceArray;

TrackITSExt* mTrackITSExtDevice;
TrackITSExt* mTrackExtensionStartTracksDevice{nullptr};
TrackExtensionCandidate<NLayers>* mTrackExtensionCandidatesDevice{nullptr};
int* mTrackExtensionCandidateOffsetsDevice{nullptr};
TrackExtensionHypothesis<NLayers>* mActiveTrackExtensionHypothesesDevice{nullptr};
TrackExtensionHypothesis<NLayers>* mNextTrackExtensionHypothesesDevice{nullptr};
TrackExtensionResult<NLayers>* mTrackExtensionResultsDevice{nullptr};
unsigned int mNTrackExtensionResults{0};
std::array<CellNeighbour*, MaxCells> mNeighboursDevice{};
CellNeighbour** mNeighboursDeviceArray{nullptr};
std::array<TrackingFrameInfo*, NLayers> mTrackingFrameInfoDevice;
Expand All @@ -238,6 +258,9 @@ class TimeFrameGPU : public TimeFrame<NLayers>

// Temporary buffer for storing output tracks from GPU tracking
bounded_vector<TrackITSExt> mTrackITSExt;
bounded_vector<TrackITSExt> mTrackExtensionStartTracks;
// Temporary buffer for fitted track extension proposals from GPU tracking
bounded_vector<TrackExtensionResult<NLayers>> mTrackExtensionResults;
};

template <int NLayers>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@
#ifndef ITSTRACKINGGPU_TRACKINGKERNELS_H_
#define ITSTRACKINGGPU_TRACKINGKERNELS_H_

#include <array>
#include <gsl/gsl>

#include "ITStracking/BoundedAllocator.h"
#include "ITStracking/ROFLookupTables.h"
#include "ITStracking/TrackingTopology.h"
#include "ITStracking/TrackExtensionCandidate.h"
#include "ITStrackingGPU/Utils.h"
#include "DetectorsBase/Propagator.h"

Expand All @@ -35,6 +37,58 @@ class Cluster;
class TrackITSExt;
class ExternalAllocator;

inline constexpr int kTrackExtensionLaunchBlocks = 60;
inline constexpr int kTrackExtensionLaunchThreadsPerBlock = 256;
inline constexpr int kTrackExtensionLaunchThreads = kTrackExtensionLaunchBlocks * kTrackExtensionLaunchThreadsPerBlock;
Comment on lines +40 to +42
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These should not be in the header IMO, but in the src file. Also starting with k is 'reserved' for enum values while we usually just capitalise them in the ITS code.


template <int NLayers>
void computeTrackExtensionCandidatesHandler(const TrackITSExt* tracks,
const IndexTableUtils<NLayers>* utils,
const typename ROFMaskTable<NLayers>::View& rofMask,
const typename ROFOverlapTable<NLayers>::View& rofOverlaps,
const Cluster** clusters,
const unsigned char** usedClusters,
const int** clustersIndexTables,
const int** ROFClusters,
const TrackingFrameInfo** trackingFrameInfo,
TrackExtensionCandidate<NLayers>* candidates,
int* candidateOffsets,
TrackExtensionHypothesis<NLayers>* activeHypotheses,
TrackExtensionHypothesis<NLayers>* nextHypotheses,
const std::array<float, NLayers> layerRadii,
const std::array<float, NLayers> layerxX0,
const int nTracks,
const int nLayers,
const int phiBins,
const int beamWidth,
const bool extendTop,
const bool extendBot,
const float bz,
const float maxChi2ClusterAttachment,
const float maxChi2NDF,
const float nSigmaCutPhi,
const float nSigmaCutZ,
const o2::base::Propagator* propagator,
const o2::base::PropagatorF::MatCorrType matCorrType,
gpu::Stream& stream);

template <int NLayers>
void computeTrackExtensionResultsHandler(const TrackITSExt* tracks,
const TrackExtensionCandidate<NLayers>* candidates,
const int* candidateOffsets,
TrackExtensionResult<NLayers>* results,
const TrackingFrameInfo** trackingFrameInfo,
const std::array<float, NLayers> layerxX0,
const int nTracks,
const int nLayers,
const float bz,
const float maxChi2ClusterAttachment,
const float maxChi2NDF,
const o2::base::Propagator* propagator,
const o2::base::PropagatorF::MatCorrType matCorrType,
const bool shiftRefToCluster,
gpu::Stream& stream);

template <int NLayers>
void countTrackletsInROFsHandler(const IndexTableUtils<NLayers>* utils,
const typename ROFMaskTable<NLayers>::View& rofMask,
Expand Down Expand Up @@ -208,7 +262,6 @@ void countTrackSeedHandler(TrackSeed<NLayers>* trackSeeds,
const std::vector<float>& layerxX0Host,
const unsigned int nSeeds,
const float Bz,
const int startLevel,
const float maxChi2ClusterAttachment,
const float maxChi2NDF,
const int reseedIfShorter,
Expand All @@ -222,20 +275,35 @@ template <int NLayers>
void computeTrackSeedHandler(TrackSeed<NLayers>* trackSeeds,
const TrackingFrameInfo** foundTrackingFrameInfo,
const Cluster** unsortedClusters,
const IndexTableUtils<NLayers>* utils,
const typename ROFMaskTable<NLayers>::View& rofMask,
const typename ROFOverlapTable<NLayers>::View& rofOverlaps,
const Cluster** clusters,
const unsigned char** usedClusters,
const int** clustersIndexTables,
const int** ROFClusters,
o2::its::TrackITSExt* tracks,
const int* seedLUT,
TrackExtensionHypothesis<NLayers>* activeHypotheses,
TrackExtensionHypothesis<NLayers>* nextHypotheses,
const std::vector<float>& layerRadiiHost,
const std::vector<float>& minPtsHost,
const std::vector<float>& layerxX0Host,
const unsigned int nSeeds,
const unsigned int nTracks,
const float Bz,
const int startLevel,
const float maxChi2ClusterAttachment,
const float maxChi2NDF,
const int reseedIfShorter,
const bool repeatRefitOut,
const bool shiftRefToCluster,
const int nLayers,
const int phiBins,
const int beamWidth,
const bool extendTop,
const bool extendBot,
const float nSigmaCutPhi,
const float nSigmaCutZ,
const o2::base::Propagator* propagator,
const o2::base::PropagatorF::MatCorrType matCorrType,
o2::its::ExternalAllocator* alloc);
Expand Down
78 changes: 78 additions & 0 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <cuda_runtime.h>

#include <algorithm>
#include <unistd.h>
#include <vector>

Expand Down Expand Up @@ -581,6 +582,72 @@ void TimeFrameGPU<NLayers>::createTrackITSExtDevice(const size_t nSeeds)
GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, mNTracks * sizeof(o2::its::TrackITSExt)));
}

template <int NLayers>
void TimeFrameGPU<NLayers>::loadTrackExtensionStartTracksDevice()
{
GPUTimer timer("loading track extension start tracks");
GPULog("gpu-transfer: loading {} track extension start tracks, for {:.2f} MB.", this->mTracks.size(), this->mTracks.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
mTrackExtensionStartTracksDevice = nullptr;
mTrackExtensionStartTracks = bounded_vector<TrackITSExt>(this->mTracks.begin(), this->mTracks.end(), this->getMemoryPool().get());
if (this->mTracks.empty()) {
return;
}
allocMem(reinterpret_cast<void**>(&mTrackExtensionStartTracksDevice), mTrackExtensionStartTracks.size() * sizeof(o2::its::TrackITSExt), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
GPUChkErrS(cudaMemcpy(mTrackExtensionStartTracksDevice, mTrackExtensionStartTracks.data(), mTrackExtensionStartTracks.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyHostToDevice));
}

template <int NLayers>
void TimeFrameGPU<NLayers>::createTrackExtensionCandidatesDevice(const size_t nTracks)
{
GPUTimer timer("reserving track extension candidates");
const size_t nCandidates = nTracks * MaxTrackExtensionCandidatesPerTrack;
GPULog("gpu-allocation: reserving {} track extension candidates, for {:.2f} MB.", nCandidates, nCandidates * sizeof(o2::its::TrackExtensionCandidate<NLayers>) / constants::MB);
mTrackExtensionCandidatesDevice = nullptr;
mTrackExtensionCandidateOffsetsDevice = nullptr;
if (nCandidates == 0) {
return;
}
allocMem(reinterpret_cast<void**>(&mTrackExtensionCandidatesDevice), nCandidates * sizeof(o2::its::TrackExtensionCandidate<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
allocMem(reinterpret_cast<void**>(&mTrackExtensionCandidateOffsetsDevice), (nTracks + 1) * sizeof(int), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
}

template <int NLayers>
void TimeFrameGPU<NLayers>::createTrackExtensionScratchDevice(const int nThreads, const int beamWidth)
{
GPUTimer timer("reserving track extension scratch");
const size_t nHypotheses = static_cast<size_t>(std::max(1, nThreads)) * std::max(1, beamWidth);
GPULog("gpu-allocation: reserving {} track extension hypotheses per scratch buffer, for {:.2f} MB each.", nHypotheses, nHypotheses * sizeof(o2::its::TrackExtensionHypothesis<NLayers>) / constants::MB);
mActiveTrackExtensionHypothesesDevice = nullptr;
mNextTrackExtensionHypothesesDevice = nullptr;
if (nHypotheses == 0) {
return;
}
allocMem(reinterpret_cast<void**>(&mActiveTrackExtensionHypothesesDevice), nHypotheses * sizeof(o2::its::TrackExtensionHypothesis<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
allocMem(reinterpret_cast<void**>(&mNextTrackExtensionHypothesesDevice), nHypotheses * sizeof(o2::its::TrackExtensionHypothesis<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
}

template <int NLayers>
void TimeFrameGPU<NLayers>::createTrackExtensionResultsDevice(const size_t nTracks)
{
GPUTimer timer("reserving fitted track extension results");
mNTrackExtensionResults = 0;
if (nTracks == 0 || mTrackExtensionCandidateOffsetsDevice == nullptr) {
mTrackExtensionResults = bounded_vector<TrackExtensionResult<NLayers>>(0, {}, this->getMemoryPool().get());
mTrackExtensionResultsDevice = nullptr;
return;
}
int nResults{0};
GPUChkErrS(cudaMemcpy(&nResults, mTrackExtensionCandidateOffsetsDevice + nTracks, sizeof(int), cudaMemcpyDeviceToHost));
mNTrackExtensionResults = nResults;
GPULog("gpu-allocation: reserving {} fitted track extension results, for {:.2f} MB.", mNTrackExtensionResults, mNTrackExtensionResults * sizeof(o2::its::TrackExtensionResult<NLayers>) / constants::MB);
mTrackExtensionResults = bounded_vector<TrackExtensionResult<NLayers>>(mNTrackExtensionResults, {}, this->getMemoryPool().get());
mTrackExtensionResultsDevice = nullptr;
if (mTrackExtensionResults.empty()) {
return;
}
allocMem(reinterpret_cast<void**>(&mTrackExtensionResultsDevice), mNTrackExtensionResults * sizeof(o2::its::TrackExtensionResult<NLayers>), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
}

template <int NLayers>
void TimeFrameGPU<NLayers>::downloadCellsDevice()
{
Expand Down Expand Up @@ -627,6 +694,17 @@ void TimeFrameGPU<NLayers>::downloadTrackITSExtDevice()
GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, mTrackITSExt.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost));
}

template <int NLayers>
void TimeFrameGPU<NLayers>::downloadTrackExtensionResultsDevice()
{
GPUTimer timer("downloading fitted track extension results");
GPULog("gpu-transfer: downloading {} fitted track extension results, for {:.2f} MB.", mTrackExtensionResults.size(), mTrackExtensionResults.size() * sizeof(o2::its::TrackExtensionResult<NLayers>) / constants::MB);
if (mTrackExtensionResults.empty()) {
return;
}
GPUChkErrS(cudaMemcpy(mTrackExtensionResults.data(), mTrackExtensionResultsDevice, mTrackExtensionResults.size() * sizeof(o2::its::TrackExtensionResult<NLayers>), cudaMemcpyDeviceToHost));
}

template <int NLayers>
void TimeFrameGPU<NLayers>::unregisterHostMemory(const int maxLayers)
{
Expand Down
Loading