Skip to content

Commit

Permalink
Merge pull request #34286 from CMSTrackerDPG/cpefast_wo_track_angle_3…
Browse files Browse the repository at this point in the history
…0_06_2021

make CPEFast to better reproduce Generic (w/o track angle)
  • Loading branch information
cmsbuild committed Jul 8, 2021
2 parents 232ab28 + a80a100 commit b44af67
Show file tree
Hide file tree
Showing 14 changed files with 360 additions and 116 deletions.
15 changes: 15 additions & 0 deletions CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h
@@ -0,0 +1,15 @@
#ifndef CUDADataFormats_TrackingRecHit_interface_SiPixelHitStatus_H
#define CUDADataFormats_TrackingRecHit_interface_SiPixelHitStatus_H

#include <cstdint>

// more information on bit fields : https://en.cppreference.com/w/cpp/language/bit_field
struct SiPixelHitStatus {
bool isBigX : 1; // ∈[0,1]
bool isOneX : 1; // ∈[0,1]
bool isBigY : 1; // ∈[0,1]
bool isOneY : 1; // ∈[0,1]
uint8_t qBin : 3; // ∈[0,1,...,7]
};

#endif
Expand Up @@ -14,10 +14,12 @@ class TrackingRecHit2DHeterogeneous {

TrackingRecHit2DHeterogeneous() = default;

explicit TrackingRecHit2DHeterogeneous(uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream);
explicit TrackingRecHit2DHeterogeneous(
uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);

~TrackingRecHit2DHeterogeneous() = default;

Expand All @@ -41,6 +43,9 @@ class TrackingRecHit2DHeterogeneous {
cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;
cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;

// needs specialization for Host
void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);

private:
static constexpr uint32_t n16 = 4; // number of elements in m_store16
static constexpr uint32_t n32 = 10; // number of elements in m_store32
Expand All @@ -65,20 +70,27 @@ class TrackingRecHit2DHeterogeneous {
int16_t* m_iphi;
};

using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::CPUTraits>;
using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::HostTraits>;

#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

template <typename Traits>
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream)
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
uint32_t nHits,
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
uint32_t const* hitsModuleStart,
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input)
: m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);

view->m_nHits = nHits;
m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>(stream);
m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream); // leave it on host and pass it by value?
m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
view->m_averageGeometry = m_AverageGeometryStore.get();
view->m_cpeParams = cpeParams;
view->m_hitsModuleStart = hitsModuleStart;
Expand All @@ -98,15 +110,21 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
// if ordering is relevant they may have to be stored phi-ordered by layer or so
// this will break 1to1 correspondence with cluster and module locality
// so unless proven VERY inefficient we keep it ordered as generated
m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
m_store32 =
Traits::template make_device_unique<float[]>(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream);
m_PhiBinnerStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);

// host copy is "reduced" (to be reviewed at some point)
if constexpr (std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
// it has to compile for ALL cases
copyFromGPU(input, stream);
} else {
assert(input == nullptr);
m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
m_store32 = Traits::template make_unique<float[]>(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream);
m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
}

static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));

auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
auto get32 = [&](int i) { return m_store32.get() + i * nHits; };

// copy all the pointers
Expand All @@ -118,20 +136,25 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
view->m_yl = get32(1);
view->m_xerr = get32(2);
view->m_yerr = get32(3);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));

view->m_xg = get32(4);
view->m_yg = get32(5);
view->m_zg = get32(6);
view->m_rg = get32(7);
if constexpr (!std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
assert(input == nullptr);
view->m_xg = get32(5);
view->m_yg = get32(6);
view->m_zg = get32(7);
view->m_rg = get32(8);

m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(0));
auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(1));

view->m_charge = reinterpret_cast<int32_t*>(get32(8));
view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
view->m_detInd = get16(1);
view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
view->m_detInd = get16(0);

m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
}

// transfer view
if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
Expand Down
53 changes: 53 additions & 0 deletions CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h
@@ -0,0 +1,53 @@
#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h
#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h

#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
#include "CUDADataFormats/Common/interface/HostProduct.h"

// a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction
class TrackingRecHit2DReduced {
public:
using HLPstorage = HostProduct<float[]>;
using HIDstorage = HostProduct<uint16_t[]>;

template <typename UP32, typename UP16>
TrackingRecHit2DReduced(UP32&& istore32, UP16&& istore16, int nhits)
: m_store32(std::move(istore32)), m_store16(std::move(istore16)), m_nHits(nhits) {
auto get32 = [&](int i) { return const_cast<float*>(m_store32.get()) + i * nhits; };

// copy all the pointers (better be in sync with the producer store)

m_view.m_xl = get32(0);
m_view.m_yl = get32(1);
m_view.m_xerr = get32(2);
m_view.m_yerr = get32(3);
m_view.m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));
m_view.m_detInd = const_cast<uint16_t*>(m_store16.get());
}

// view only!
TrackingRecHit2DReduced(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {}

TrackingRecHit2DReduced() = default;
~TrackingRecHit2DReduced() = default;

TrackingRecHit2DReduced(const TrackingRecHit2DReduced&) = delete;
TrackingRecHit2DReduced& operator=(const TrackingRecHit2DReduced&) = delete;
TrackingRecHit2DReduced(TrackingRecHit2DReduced&&) = default;
TrackingRecHit2DReduced& operator=(TrackingRecHit2DReduced&&) = default;

TrackingRecHit2DSOAView& view() { return m_view; }
TrackingRecHit2DSOAView const& view() const { return m_view; }

auto nHits() const { return m_nHits; }

private:
TrackingRecHit2DSOAView m_view;

HLPstorage m_store32;
HIDstorage m_store16;

int m_nHits;
};

#endif
22 changes: 19 additions & 3 deletions CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
Expand Up @@ -7,13 +7,17 @@
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
#include "CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h"

namespace pixelCPEforGPU {
struct ParamsOnGPU;
}

class TrackingRecHit2DSOAView {
public:
using Status = SiPixelHitStatus;
static_assert(sizeof(Status) == sizeof(uint8_t));

using hindex_type = uint32_t; // if above is <=2^32

using PhiBinner = cms::cuda::HistoContainer<int16_t, 128, -1, 8 * sizeof(int16_t), hindex_type, 10>;
Expand All @@ -22,6 +26,7 @@ class TrackingRecHit2DSOAView {

template <typename>
friend class TrackingRecHit2DHeterogeneous;
friend class TrackingRecHit2DReduced;

__device__ __forceinline__ uint32_t nHits() const { return m_nHits; }

Expand All @@ -47,8 +52,18 @@ class TrackingRecHit2DSOAView {
__device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; }
__device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); }

__device__ __forceinline__ int32_t& charge(int i) { return m_charge[i]; }
__device__ __forceinline__ int32_t charge(int i) const { return __ldg(m_charge + i); }
__device__ __forceinline__ void setChargeAndStatus(int i, uint32_t ich, Status is) {
ich = std::min(ich, chargeMask());
uint32_t w = *reinterpret_cast<uint8_t*>(&is);
ich |= (w << 24);
m_chargeAndStatus[i] = ich;
}

__device__ __forceinline__ Status status(int i) const {
uint8_t w = __ldg(m_chargeAndStatus + i) >> 24;
return *reinterpret_cast<Status*>(&w);
}

__device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; }
__device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); }
__device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; }
Expand Down Expand Up @@ -79,7 +94,8 @@ class TrackingRecHit2DSOAView {
int16_t* m_iphi;

// cluster properties
int32_t* m_charge;
static constexpr uint32_t chargeMask() { return (1 << 24) - 1; }
uint32_t* m_chargeAndStatus;
int16_t* m_xsize;
int16_t* m_ysize;
uint16_t* m_detInd;
Expand Down
Expand Up @@ -6,15 +6,22 @@

template <>
cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<float[]>(4 * nHits(), stream);
cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), stream);
auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream);
cms::cuda::copyAsync(ret, m_store32, 5 * nHits(), stream);
return ret;
}

template <>
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<uint32_t[]>(gpuClustering::maxNumModules + 1, stream);
cudaCheck(cudaMemcpyAsync(
ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (gpuClustering::maxNumModules + 1), cudaMemcpyDefault, stream));
return ret;
}

// the only specialization needed
template <>
void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) {
assert(input);
m_store32 = input->localCoordToHostAsync(stream);
}
1 change: 1 addition & 0 deletions CUDADataFormats/TrackingRecHit/src/classes.h
Expand Up @@ -3,6 +3,7 @@

#include "CUDADataFormats/Common/interface/Product.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif // CUDADataFormats_SiPixelCluster_src_classes_h
6 changes: 4 additions & 2 deletions CUDADataFormats/TrackingRecHit/src/classes_def.xml
@@ -1,8 +1,10 @@
<lcgdict>
<class name="TrackingRecHit2DCPU" persistent="false"/>
<class name="TrackingRecHit2DHost" persistent="false"/>
<class name="cms::cuda::Product<TrackingRecHit2DGPU>" persistent="false"/>
<class name="edm::Wrapper<TrackingRecHit2DCPU>" persistent="false"/>
<class name="TrackingRecHit2DHost" persistent="false"/>
<class name="edm::Wrapper<TrackingRecHit2DHost>" persistent="false"/>
<class name="cms::cuda::Product<TrackingRecHit2DGPU>" persistent="false"/>
<class name="edm::Wrapper<cms::cuda::Product<TrackingRecHit2DGPU>>" persistent="false"/>
<class name="TrackingRecHit2DReduced" persistent="false"/>
<class name="edm::Wrapper<TrackingRecHit2DReduced>" persistent="false"/>
</lcgdict>
Expand Up @@ -15,12 +15,17 @@ int main() {
cudaStream_t stream;
cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

auto nHits = 200;
// inner scope to deallocate memory before destroying the stream
{
auto nHits = 200;
TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream);

testTrackingRecHit2D::runKernels(tkhit.view());

TrackingRecHit2DHost tkhitH(nHits, nullptr, nullptr, stream, &tkhit);
cudaStreamSynchronize(stream);
assert(tkhitH.view());
assert(tkhitH.view()->nHits() == unsigned(nHits));
}

cudaCheck(cudaStreamDestroy(stream));
Expand Down

0 comments on commit b44af67

Please sign in to comment.