Skip to content

Commit

Permalink
Next prototype of the framework integration (#100)
Browse files Browse the repository at this point in the history
Provide a mechanism for a chain of modules to share a resource, such as CUDA device memory or a CUDA stream.
Minimize data movements between the CPU and the device, and support multiple devices.
Allow the same job configuration to be used on all hardware combinations.

See HeterogeneousCore/CUDACore/README.md for a more detailed description and examples.
  • Loading branch information
makortel authored and fwyzard committed Nov 6, 2020
1 parent 10c5c8a commit 3b10e5e
Show file tree
Hide file tree
Showing 37 changed files with 1,225 additions and 223 deletions.
1 change: 1 addition & 0 deletions CUDADataFormats/SiPixelCluster/BuildFile.xml
@@ -1,6 +1,7 @@
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda-api-wrappers"/>
<use name="rootcore"/>

<export>
<lib name="1"/>
Expand Down
29 changes: 16 additions & 13 deletions CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h
@@ -1,35 +1,39 @@
#ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h
#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h

#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

#include <cuda/api_wrappers.h>

class SiPixelClustersCUDA {
public:
SiPixelClustersCUDA() = default;
explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream);
explicit SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream);
~SiPixelClustersCUDA() = default;

SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete;
SiPixelClustersCUDA& operator=(const SiPixelClustersCUDA&) = delete;
SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default;
SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default;

void setNClusters(uint32_t nClusters) {
nClusters_h = nClusters;
}

uint32_t nClusters() const { return nClusters_h; }

uint32_t *moduleStart() { return moduleStart_d.get(); }
int32_t *clus() { return clus_d.get(); }
uint32_t *clusInModule() { return clusInModule_d.get(); }
uint32_t *moduleId() { return moduleId_d.get(); }
uint32_t *clusModuleStart() { return clusModuleStart_d.get(); }

uint32_t const *moduleStart() const { return moduleStart_d.get(); }
int32_t const *clus() const { return clus_d.get(); }
uint32_t const *clusInModule() const { return clusInModule_d.get(); }
uint32_t const *moduleId() const { return moduleId_d.get(); }
uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); }

uint32_t const *c_moduleStart() const { return moduleStart_d.get(); }
int32_t const *c_clus() const { return clus_d.get(); }
uint32_t const *c_clusInModule() const { return clusInModule_d.get(); }
uint32_t const *c_moduleId() const { return moduleId_d.get(); }
uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); }
Expand All @@ -40,7 +44,6 @@ class SiPixelClustersCUDA {

#ifdef __CUDACC__
__device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); }
__device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); }
__device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); }
__device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); }
__device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); }
Expand All @@ -50,7 +53,6 @@ class SiPixelClustersCUDA {

private:
uint32_t const *moduleStart_;
int32_t const *clus_;
uint32_t const *clusInModule_;
uint32_t const *moduleId_;
uint32_t const *clusModuleStart_;
Expand All @@ -59,15 +61,16 @@ class SiPixelClustersCUDA {
DeviceConstView *view() const { return view_d.get(); }

private:
edm::cuda::device::unique_ptr<uint32_t[]> moduleStart_d; // index of the first pixel of each module
edm::cuda::device::unique_ptr<int32_t[]> clus_d; // cluster id of each pixel
edm::cuda::device::unique_ptr<uint32_t[]> clusInModule_d; // number of clusters found in each module
edm::cuda::device::unique_ptr<uint32_t[]> moduleId_d; // module id of each module
cudautils::device::unique_ptr<uint32_t[]> moduleStart_d; // index of the first pixel of each module
cudautils::device::unique_ptr<uint32_t[]> clusInModule_d; // number of clusters found in each module
cudautils::device::unique_ptr<uint32_t[]> moduleId_d; // module id of each module

// originally from rechits
edm::cuda::device::unique_ptr<uint32_t[]> clusModuleStart_d;
cudautils::device::unique_ptr<uint32_t[]> clusModuleStart_d;

cudautils::device::unique_ptr<DeviceConstView> view_d; // "me" pointer

edm::cuda::device::unique_ptr<DeviceConstView> view_d; // "me" pointer
uint32_t nClusters_h;
};

#endif
15 changes: 7 additions & 8 deletions CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc
Expand Up @@ -2,23 +2,22 @@

#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"

SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) {
SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream) {
edm::Service<CUDAService> cs;

moduleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);
clus_d = cs->make_device_unique< int32_t[]>(feds, stream);
clusInModule_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
moduleId_d = cs->make_device_unique<uint32_t[]>(nelements, stream);
clusModuleStart_d = cs->make_device_unique<uint32_t[]>(nelements+1, stream);
moduleStart_d = cs->make_device_unique<uint32_t[]>(maxClusters+1, stream);
clusInModule_d = cs->make_device_unique<uint32_t[]>(maxClusters, stream);
moduleId_d = cs->make_device_unique<uint32_t[]>(maxClusters, stream);
clusModuleStart_d = cs->make_device_unique<uint32_t[]>(maxClusters+1, stream);

auto view = cs->make_host_unique<DeviceConstView>(stream);
view->moduleStart_ = moduleStart_d.get();
view->clus_ = clus_d.get();
view->clusInModule_ = clusInModule_d.get();
view->moduleId_ = moduleId_d.get();
view->clusModuleStart_ = clusModuleStart_d.get();

view_d = cs->make_device_unique<DeviceConstView>(stream);
cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id());
cudautils::copyAsync(view_d, view, stream);
}
8 changes: 8 additions & 0 deletions CUDADataFormats/SiPixelCluster/src/classes.h
@@ -0,0 +1,8 @@
#ifndef CUDADataFormats_SiPixelCluster_classes_h
#define CUDADataFormats_SiPixelCluster_classes_h

// Headers required to build the dictionary entries declared in
// src/classes_def.xml for the CUDAProduct<SiPixelClustersCUDA> wrapper.
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif
4 changes: 4 additions & 0 deletions CUDADataFormats/SiPixelCluster/src/classes_def.xml
@@ -0,0 +1,4 @@
<!-- Dictionary declarations for the device-resident pixel cluster product.
     persistent="false": these wrappers are transient only and are never
     written to an output file. -->
<lcgdict>
<class name="CUDAProduct<SiPixelClustersCUDA>" persistent="false"/>
<class name="edm::Wrapper<CUDAProduct<SiPixelClustersCUDA>>" persistent="false"/>
</lcgdict>
2 changes: 2 additions & 0 deletions CUDADataFormats/SiPixelDigi/BuildFile.xml
@@ -1,6 +1,8 @@
<use name="DataFormats/SiPixelRawData"/>
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda-api-wrappers"/>
<use name="rootcore"/>

<export>
<lib name="1"/>
Expand Down
40 changes: 40 additions & 0 deletions CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h
@@ -0,0 +1,40 @@
#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h
#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h

#include "DataFormats/SiPixelDigi/interface/PixelErrors.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"

#include <cuda/api_wrappers.h>

// Owns the SiPixel raw-data decoding errors on the CUDA device, plus the
// host-side PixelFormatterErrors. The device data live in a flat buffer
// (data_d) wrapped by a GPU::SimpleVector header (error_d); error_h is the
// host mirror of that header used for asynchronous readback.
class SiPixelDigiErrorsCUDA {
public:
SiPixelDigiErrorsCUDA() = default;
// Allocates device storage for up to maxFedWords error words on the given
// CUDA stream and takes ownership of the host-side formatter errors.
explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream);
~SiPixelDigiErrorsCUDA() = default;

// Device memory is uniquely owned: movable but not copyable.
SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete;
SiPixelDigiErrorsCUDA& operator=(const SiPixelDigiErrorsCUDA&) = delete;
SiPixelDigiErrorsCUDA(SiPixelDigiErrorsCUDA&&) = default;
SiPixelDigiErrorsCUDA& operator=(SiPixelDigiErrorsCUDA&&) = default;

// Host-side formatter errors captured at construction time.
const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; }

// Device pointer to the error vector header (c_error is the explicitly
// const-qualified accessor, matching the convention of the other CUDA
// data formats in this package).
GPU::SimpleVector<PixelErrorCompact> *error() { return error_d.get(); }
GPU::SimpleVector<PixelErrorCompact> const *error() const { return error_d.get(); }
GPU::SimpleVector<PixelErrorCompact> const *c_error() const { return error_d.get(); }

// Pair of (vector header re-pointed at host memory, owning host buffer).
// The header is returned by value, so the buffer's ownership must travel
// with it -- hence the pair.
using HostDataError = std::pair<GPU::SimpleVector<PixelErrorCompact>, cudautils::host::unique_ptr<PixelErrorCompact[]>>;
// Asynchronously copies the error data to a new host buffer on `stream`.
HostDataError dataErrorToHostAsync(cuda::stream_t<>& stream) const;

// Asynchronously refreshes the host-side vector header from the device.
void copyErrorToHostAsync(cuda::stream_t<>& stream);

private:
cudautils::device::unique_ptr<PixelErrorCompact[]> data_d;  // device error storage
cudautils::device::unique_ptr<GPU::SimpleVector<PixelErrorCompact>> error_d;  // device vector header
cudautils::host::unique_ptr<GPU::SimpleVector<PixelErrorCompact>> error_h;  // host mirror of the header
PixelFormatterErrors formatterErrors_h;
};

#endif
50 changes: 42 additions & 8 deletions CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
@@ -1,36 +1,58 @@
#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h

#include "CUDADataFormats/Common/interface/device_unique_ptr.h"
#include "FWCore/Utilities/interface/propagate_const.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

#include <cuda/api_wrappers.h>

class SiPixelDigisCUDA {
public:
SiPixelDigisCUDA() = default;
explicit SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream);
explicit SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream);
~SiPixelDigisCUDA() = default;

SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete;
SiPixelDigisCUDA& operator=(const SiPixelDigisCUDA&) = delete;
SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default;
SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default;

void setNModulesDigis(uint32_t nModules, uint32_t nDigis) {
nModules_h = nModules;
nDigis_h = nDigis;
}

uint32_t nModules() const { return nModules_h; }
uint32_t nDigis() const { return nDigis_h; }

uint16_t * xx() { return xx_d.get(); }
uint16_t * yy() { return yy_d.get(); }
uint16_t * adc() { return adc_d.get(); }
uint16_t * moduleInd() { return moduleInd_d.get(); }
int32_t * clus() { return clus_d.get(); }
uint32_t * pdigi() { return pdigi_d.get(); }
uint32_t * rawIdArr() { return rawIdArr_d.get(); }

uint16_t const *xx() const { return xx_d.get(); }
uint16_t const *yy() const { return yy_d.get(); }
uint16_t const *adc() const { return adc_d.get(); }
uint16_t const *moduleInd() const { return moduleInd_d.get(); }
int32_t const *clus() const { return clus_d.get(); }
uint32_t const *pdigi() const { return pdigi_d.get(); }
uint32_t const *rawIdArr() const { return rawIdArr_d.get(); }

uint16_t const *c_xx() const { return xx_d.get(); }
uint16_t const *c_yy() const { return yy_d.get(); }
uint16_t const *c_adc() const { return adc_d.get(); }
uint16_t const *c_moduleInd() const { return moduleInd_d.get(); }
int32_t const *c_clus() const { return clus_d.get(); }
uint32_t const *c_pdigi() const { return pdigi_d.get(); }
uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); }

cudautils::host::unique_ptr<uint16_t[]> adcToHostAsync(cuda::stream_t<>& stream) const;
cudautils::host::unique_ptr< int32_t[]> clusToHostAsync(cuda::stream_t<>& stream) const;
cudautils::host::unique_ptr<uint32_t[]> pdigiToHostAsync(cuda::stream_t<>& stream) const;
cudautils::host::unique_ptr<uint32_t[]> rawIdArrToHostAsync(cuda::stream_t<>& stream) const;

class DeviceConstView {
public:
Expand All @@ -41,6 +63,7 @@ class SiPixelDigisCUDA {
__device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); }
__device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); }
__device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); }
__device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); }
#endif

friend class SiPixelDigisCUDA;
Expand All @@ -50,16 +73,27 @@ class SiPixelDigisCUDA {
uint16_t const *yy_;
uint16_t const *adc_;
uint16_t const *moduleInd_;
int32_t const *clus_;
};

const DeviceConstView *view() const { return view_d.get(); }

private:
edm::cuda::device::unique_ptr<uint16_t[]> xx_d; // local coordinates of each pixel
edm::cuda::device::unique_ptr<uint16_t[]> yy_d; //
edm::cuda::device::unique_ptr<uint16_t[]> adc_d; // ADC of each pixel
edm::cuda::device::unique_ptr<uint16_t[]> moduleInd_d; // module id of each pixel
edm::cuda::device::unique_ptr<DeviceConstView> view_d; // "me" pointer
// These are consumed by downstream device code
cudautils::device::unique_ptr<uint16_t[]> xx_d; // local coordinates of each pixel
cudautils::device::unique_ptr<uint16_t[]> yy_d; //
cudautils::device::unique_ptr<uint16_t[]> adc_d; // ADC of each pixel
cudautils::device::unique_ptr<uint16_t[]> moduleInd_d; // module id of each pixel
cudautils::device::unique_ptr<int32_t[]> clus_d; // cluster id of each pixel
cudautils::device::unique_ptr<DeviceConstView> view_d; // "me" pointer

// These are for CPU output; should we (eventually) place them to a
// separate product?
cudautils::device::unique_ptr<uint32_t[]> pdigi_d;
cudautils::device::unique_ptr<uint32_t[]> rawIdArr_d;

uint32_t nModules_h = 0;
uint32_t nDigis_h = 0;
};

#endif
44 changes: 44 additions & 0 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
@@ -0,0 +1,44 @@
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h"

#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"

// Allocates the device-side error buffer and SimpleVector header for up to
// maxFedWords entries, zeroes the buffer, builds the header on the host and
// transfers it to the device. All device operations are asynchronous and
// ordered on `stream`.
SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream):
formatterErrors_h(std::move(errors))
{
edm::Service<CUDAService> cs;

// Device vector header and the flat data buffer backing it.
error_d = cs->make_device_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
data_d = cs->make_device_unique<PixelErrorCompact[]>(maxFedWords, stream);

cudautils::memsetAsync(data_d, 0x00, maxFedWords, stream);

// Initialize the header host-side: size 0, capacity maxFedWords, pointing
// at the device data buffer; then copy the header to the device.
error_h = cs->make_host_unique<GPU::SimpleVector<PixelErrorCompact>>(stream);
GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get());
assert(error_h->size() == 0);
assert(error_h->capacity() == static_cast<int>(maxFedWords));

cudautils::copyAsync(error_d, error_h, stream);
}

// Asynchronously refreshes the host-side vector header (error_h) from the
// device copy on `stream`. NOTE(review): the copy is asynchronous, so the
// caller presumably must synchronize the stream before reading error_h --
// confirm against callers.
void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cuda::stream_t<>& stream) {
cudautils::copyAsync(error_h, error_d, stream);
}

// Asynchronously copies the error data to a newly allocated host buffer and
// returns it together with a SimpleVector header re-pointed at that buffer.
// NOTE(review): reads size()/capacity() from the host-side header, so a
// copyErrorToHostAsync() must have completed first -- confirm with callers.
SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cuda::stream_t<>& stream) const {
edm::Service<CUDAService> cs;
// On one hand size() could be sufficient. On the other hand, if
// someone copies the SimpleVector<>, (s)he might expect the data
// buffer to actually have space for capacity() elements.
auto data = cs->make_host_unique<PixelErrorCompact[]>(error_h->capacity(), stream);

// but transfer only the required amount
if(error_h->size() > 0) {
cudautils::copyAsync(data, data_d, error_h->size(), stream);
}
// Return a header that points at the host buffer, transferring ownership
// of the buffer alongside it.
auto err = *error_h;
err.set_data(data.get());
return HostDataError(std::move(err), std::move(data));
}
49 changes: 40 additions & 9 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
Expand Up @@ -2,24 +2,55 @@

#include "FWCore/ServiceRegistry/interface/Service.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"

#include <cuda_runtime.h>

SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) {
SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream) {
edm::Service<CUDAService> cs;

xx_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
yy_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
adc_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
moduleInd_d = cs->make_device_unique<uint16_t[]>(nelements, stream);
xx_d = cs->make_device_unique<uint16_t[]>(maxFedWords, stream);
yy_d = cs->make_device_unique<uint16_t[]>(maxFedWords, stream);
adc_d = cs->make_device_unique<uint16_t[]>(maxFedWords, stream);
moduleInd_d = cs->make_device_unique<uint16_t[]>(maxFedWords, stream);
clus_d = cs->make_device_unique< int32_t[]>(maxFedWords, stream);

pdigi_d = cs->make_device_unique<uint32_t[]>(maxFedWords, stream);
rawIdArr_d = cs->make_device_unique<uint32_t[]>(maxFedWords, stream);

auto view = cs->make_host_unique<DeviceConstView>(stream);
view->xx_ = xx_d.get();
view->yy_ = yy_d.get();
view->adc_ = adc_d.get();
view->moduleInd_ = moduleInd_d.get();
view->clus_ = clus_d.get();

view_d = cs->make_device_unique<DeviceConstView>(stream);
cudaCheck(cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id()));
cudautils::copyAsync(view_d, view, stream);
}

// Asynchronously transfers the per-pixel ADC values from the device into a
// freshly allocated host buffer of nDigis() elements; the copy is ordered
// on `stream`, so the caller owns synchronization before reading the data.
cudautils::host::unique_ptr<uint16_t[]> SiPixelDigisCUDA::adcToHostAsync(cuda::stream_t<>& stream) const {
  edm::Service<CUDAService> cudaService;
  const auto n = nDigis();
  auto hostBuffer = cudaService->make_host_unique<uint16_t[]>(n, stream);
  cudautils::copyAsync(hostBuffer, adc_d, n, stream);
  return hostBuffer;
}

// Asynchronously transfers the per-pixel cluster ids from the device into a
// freshly allocated host buffer of nDigis() elements; the copy is ordered
// on `stream`, so the caller owns synchronization before reading the data.
cudautils::host::unique_ptr<int32_t[]> SiPixelDigisCUDA::clusToHostAsync(cuda::stream_t<>& stream) const {
  edm::Service<CUDAService> cudaService;
  const auto n = nDigis();
  auto hostBuffer = cudaService->make_host_unique<int32_t[]>(n, stream);
  cudautils::copyAsync(hostBuffer, clus_d, n, stream);
  return hostBuffer;
}

// Asynchronously transfers the packed digi words from the device into a
// freshly allocated host buffer of nDigis() elements; the copy is ordered
// on `stream`, so the caller owns synchronization before reading the data.
cudautils::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::pdigiToHostAsync(cuda::stream_t<>& stream) const {
  edm::Service<CUDAService> cudaService;
  const auto n = nDigis();
  auto hostBuffer = cudaService->make_host_unique<uint32_t[]>(n, stream);
  cudautils::copyAsync(hostBuffer, pdigi_d, n, stream);
  return hostBuffer;
}

// Asynchronously transfers the raw detector ids from the device into a
// freshly allocated host buffer of nDigis() elements; the copy is ordered
// on `stream`, so the caller owns synchronization before reading the data.
cudautils::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::rawIdArrToHostAsync(cuda::stream_t<>& stream) const {
  edm::Service<CUDAService> cudaService;
  const auto n = nDigis();
  auto hostBuffer = cudaService->make_host_unique<uint32_t[]>(n, stream);
  cudautils::copyAsync(hostBuffer, rawIdArr_d, n, stream);
  return hostBuffer;
}

0 comments on commit 3b10e5e

Please sign in to comment.