Skip to content

Commit

Permalink
Fix the Riemann fit on GPU (cms-sw#128)
Browse files Browse the repository at this point in the history
  • Loading branch information
felicepantaleo authored and fwyzard committed Aug 10, 2018
1 parent 8c2bc8f commit 0301b0d
Show file tree
Hide file tree
Showing 3 changed files with 188 additions and 93 deletions.
Expand Up @@ -71,7 +71,7 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks,

std::vector<float> hits_and_covariances;
float * hits_and_covariancesGPU = nullptr;
std::vector<Rfit::helix_fit> helix_fit_results;
Rfit::helix_fit * helix_fit_results = nullptr;
Rfit::helix_fit * helix_fit_resultsGPU = nullptr;

const int points_in_seed = 4;
Expand Down Expand Up @@ -101,26 +101,24 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks,
}

// We pretend to have one fit for every seed
helix_fit_results.resize(total_seeds);

cudaCheck(cudaMalloc((void**)&hits_and_covariancesGPU, sizeof(float)*hits_and_covariances.size()));
cudaCheck(cudaMalloc((void**)&helix_fit_resultsGPU, sizeof(Rfit::helix_fit)*helix_fit_results.size()));
cudaCheck(cudaMallocHost(&helix_fit_results, sizeof(Rfit::helix_fit)*total_seeds));
cudaCheck(cudaMalloc(&hits_and_covariancesGPU, sizeof(float)*hits_and_covariances.size()));
cudaCheck(cudaMalloc(&helix_fit_resultsGPU, sizeof(Rfit::helix_fit)*total_seeds));
cudaCheck(cudaMemset(helix_fit_resultsGPU, 0x00, sizeof(Rfit::helix_fit)*total_seeds ));
// CUDA MALLOC OF HITS AND COV AND HELIX_FIT RESULTS

// CUDA MEMCOPY HOST2DEVICE OF HITS AND COVS AND HELIX_FIT RESULTS
cudaCheck(cudaMemcpy(hits_and_covariancesGPU, hits_and_covariances.data(),
sizeof(float)*hits_and_covariances.size(), cudaMemcpyHostToDevice));
sizeof(float)*hits_and_covariances.size(), cudaMemcpyDefault));

// LAUNCH THE KERNEL FIT
launchKernelFit(hits_and_covariancesGPU, 12*4*total_seeds, 4,
bField, helix_fit_resultsGPU);
// CUDA MEMCOPY DEVICE2HOST OF HELIX_FIT
cudaCheck(cudaDeviceSynchronize());
// cudaCheck(cudaMemcpy(helix_fit_results.data(), helix_fit_resultsGPU,
// sizeof(Rfit::helix_fit)*helix_fit_results.size(), cudaMemcpyDeviceToHost));
cudaCheck(cudaGetLastError());
cudaCheck(cudaMemcpy(helix_fit_results.data(), helix_fit_resultsGPU,
sizeof(Rfit::helix_fit)*helix_fit_results.size(), cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(helix_fit_results, helix_fit_resultsGPU,
sizeof(Rfit::helix_fit)*total_seeds, cudaMemcpyDefault));

cudaCheck(cudaFree(hits_and_covariancesGPU));
cudaCheck(cudaFree(helix_fit_resultsGPU));
Expand All @@ -137,7 +135,11 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks,
const TrackingRegion& region = regionHitSets.region();
for(const SeedingHitSet& tuplet: regionHitSets) {
auto nHits = tuplet.size(); hits.resize(nHits);
for (unsigned int iHit = 0; iHit < nHits; ++iHit) hits[iHit] = tuplet[iHit];

for (unsigned int iHit = 0; iHit < nHits; ++iHit)
{
hits[iHit] = tuplet[iHit];
}
auto const &fittedTrack = helix_fit_results[counter];
counter++;
int iCharge = fittedTrack.q;
Expand All @@ -146,7 +148,7 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks,
float valPt = fittedTrack.par(2);
float valCotTheta = fittedTrack.par(3);
float valZip = fittedTrack.par(4);
//

// PixelTrackErrorParam param(valEta, valPt);
float errValPhi = std::sqrt(fittedTrack.cov(0, 0));
float errValTip = std::sqrt(fittedTrack.cov(1, 1));
Expand Down Expand Up @@ -178,7 +180,9 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks,
}
}

// skip ovelrapped tracks
cudaCheck(cudaFreeHost(helix_fit_results));

// skip overlapped tracks
if(!theCleanerName.empty()) {
edm::ESHandle<PixelTrackCleaner> hcleaner;
es.get<PixelTrackCleaner::Record>().get(theCleanerName, hcleaner);
Expand Down
Expand Up @@ -4,18 +4,20 @@
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "PixelTrackReconstructionGPU.h"

#ifndef GPU_DEBUG
#define GPU_DEBUG 0
#endif // GPU_DEBUG

using namespace Eigen;

__global__
void KernelFullFitAllHits(float * hits_and_covariances,
__global__ void
KernelFastFitAllHits(float *hits_and_covariances,
int hits_in_fit,
int cumulative_size,
double B,
Rfit::helix_fit * results) {
float B,
Rfit::helix_fit *results,
Rfit::Matrix3xNd *hits,
Rfit::Matrix3Nd *hits_cov,
Rfit::circle_fit *circle_fit,
Vector4d *fast_fit,
Rfit::line_fit *line_fit)
{
// Reshape Eigen components from hits_and_covariances, using proper thread and block indices
// Perform the fit
// Store the results in the proper vector, using again correct indices
Expand All @@ -24,101 +26,187 @@ void KernelFullFitAllHits(float * hits_and_covariances,
// first 3 are the points
// the rest is the covariance matrix, 3x3
int start = (blockIdx.x * blockDim.x + threadIdx.x) * hits_in_fit * 12;
int helix_start = (blockIdx.x * blockDim.x + threadIdx.x);
int helix_start = (blockIdx.x * blockDim.x + threadIdx.x);
if (start >= cumulative_size) {
return;
}

#if GPU_DEBUG
#ifdef GPU_DEBUG
printf("BlockDim.x: %d, BlockIdx.x: %d, threadIdx.x: %d, start: %d, cumulative_size: %d\n",
blockDim.x, blockIdx.x, threadIdx.x, start, cumulative_size);
#endif

Rfit::Matrix3xNd hits(3,hits_in_fit);
Rfit::Matrix3Nd hits_cov(3 * hits_in_fit, 3 * hits_in_fit);
hits[helix_start].resize(3, hits_in_fit);
hits_cov[helix_start].resize(3 * hits_in_fit, 3 * hits_in_fit);

// Prepare data structure (stack)
for (unsigned int i = 0; i < hits_in_fit; ++i) {
hits.col(i) << hits_and_covariances[start],
hits_and_covariances[start+1],hits_and_covariances[start+2];
hits[helix_start].col(i) << hits_and_covariances[start],
hits_and_covariances[start + 1], hits_and_covariances[start + 2];
start += 3;

for (auto j = 0; j < 3; ++j) {
for (auto l = 0; l < 3; ++l) {
hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = hits_and_covariances[start];
hits_cov[helix_start](i + j * hits_in_fit, i + l * hits_in_fit) =
hits_and_covariances[start];
start++;
}
}
}

#if GPU_DEBUG
printf("KernelFullFitAllHits hits(0,0): %d\t%f\n", helix_start, hits(0,0));
printf("KernelFullFitAllHits hits(0,1): %d\t%f\n", helix_start, hits(0,1));
printf("KernelFullFitAllHits hits(0,2): %d\t%f\n", helix_start, hits(0,2));
printf("KernelFullFitAllHits hits(0,3): %d\t%f\n", helix_start, hits(0,3));
printf("KernelFullFitAllHits hits(1,0): %d\t%f\n", helix_start, hits(1,0));
printf("KernelFullFitAllHits hits(1,1): %d\t%f\n", helix_start, hits(1,1));
printf("KernelFullFitAllHits hits(1,2): %d\t%f\n", helix_start, hits(1,2));
printf("KernelFullFitAllHits hits(1,3): %d\t%f\n", helix_start, hits(1,3));
printf("KernelFullFitAllHits hits(2,0): %d\t%f\n", helix_start, hits(2,0));
printf("KernelFullFitAllHits hits(2,1): %d\t%f\n", helix_start, hits(2,1));
printf("KernelFullFitAllHits hits(2,2): %d\t%f\n", helix_start, hits(2,2));
printf("KernelFullFitAllHits hits(2,3): %d\t%f\n", helix_start, hits(2,3));
Rfit::printIt(&hits);
Rfit::printIt(&hits_cov);
#endif
fast_fit[helix_start] = Rfit::Fast_fit(hits[helix_start]);
}

// Perform actual fit
Vector4d fast_fit = Rfit::Fast_fit(hits);
__global__ void
KernelCircleFitAllHits(float *hits_and_covariances, int hits_in_fit,
int cumulative_size, float B, Rfit::helix_fit *results,
Rfit::Matrix3xNd *hits, Rfit::Matrix3Nd *hits_cov,
Rfit::circle_fit *circle_fit, Vector4d *fast_fit,
Rfit::line_fit *line_fit)
{
// Reshape Eigen components from hits_and_covariances, using proper thread and block indices
// Perform the fit
// Store the results in the proper vector, using again correct indices

#if GPU_DEBUG
printf("KernelFullFitAllHits fast_fit(0): %d %f\n", helix_start, fast_fit(0));
printf("KernelFullFitAllHits fast_fit(1): %d %f\n", helix_start, fast_fit(1));
printf("KernelFullFitAllHits fast_fit(2): %d %f\n", helix_start, fast_fit(2));
printf("KernelFullFitAllHits fast_fit(3): %d %f\n", helix_start, fast_fit(3));
#endif
// Loop for hits_in_fit times:
// first 3 are the points
// the rest is the covariance matrix, 3x3
int start = (blockIdx.x * blockDim.x + threadIdx.x) * hits_in_fit * 12;
int helix_start = (blockIdx.x * blockDim.x + threadIdx.x);
if (start >= cumulative_size) {
return;
}

u_int n = hits.cols();
#if GPU_DEBUG
printf("KernelFullFitAllHits using %d hits: %d\n", n, helix_start);
#ifdef GPU_DEBUG
printf("BlockDim.x: %d, BlockIdx.x: %d, threadIdx.x: %d, start: %d, "
"cumulative_size: %d\n",
blockDim.x, blockIdx.x, threadIdx.x, start, cumulative_size);
#endif
u_int n = hits[helix_start].cols();

Rfit::VectorNd rad = (hits[helix_start].block(0, 0, 2, n).colwise().norm());

circle_fit[helix_start] =
Rfit::Circle_fit(hits[helix_start].block(0, 0, 2, n),
hits_cov[helix_start].block(0, 0, 2 * n, 2 * n),
fast_fit[helix_start], rad, B, true, true);

#ifdef GPU_DEBUG
printf("KernelCircleFitAllHits circle.par(0): %d %f\n", helix_start,
circle_fit[helix_start].par(0));
printf("KernelCircleFitAllHits circle.par(1): %d %f\n", helix_start,
circle_fit[helix_start].par(1));
printf("KernelCircleFitAllHits circle.par(2): %d %f\n", helix_start,
circle_fit[helix_start].par(2));
#endif

Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm());
}

__global__ void
KernelLineFitAllHits(float *hits_and_covariances, int hits_in_fit,
int cumulative_size, float B, Rfit::helix_fit *results,
Rfit::Matrix3xNd *hits, Rfit::Matrix3Nd *hits_cov,
Rfit::circle_fit *circle_fit, Vector4d *fast_fit,
Rfit::line_fit *line_fit)
{
// Reshape Eigen components from hits_and_covariances, using proper thread and block indices
// Perform the fit
// Store the results in the proper vector, using again correct indices

// Loop for hits_in_fit times:
// first 3 are the points
// the rest is the covariance matrix, 3x3
int start = (blockIdx.x * blockDim.x + threadIdx.x) * hits_in_fit * 12;
int helix_start = (blockIdx.x * blockDim.x + threadIdx.x);
if (start >= cumulative_size) {
return;
}

Rfit::circle_fit circle =
Rfit::Circle_fit(hits.block(0,0,2,n), hits_cov.block(0, 0, 2 * n, 2 * n),
fast_fit, rad, B, true, true);
#ifdef GPU_DEBUG

#if GPU_DEBUG
printf("KernelFullFitAllHits circle.par(0): %d %f\n", helix_start, circle.par(0));
printf("KernelFullFitAllHits circle.par(1): %d %f\n", helix_start, circle.par(1));
printf("KernelFullFitAllHits circle.par(2): %d %f\n", helix_start, circle.par(2));
printf("BlockDim.x: %d, BlockIdx.x: %d, threadIdx.x: %d, start: %d, "
"cumulative_size: %d\n",
blockDim.x, blockIdx.x, threadIdx.x, start, cumulative_size);
#endif

Rfit::line_fit line = Rfit::Line_fit(hits, hits_cov, circle, fast_fit, true);
line_fit[helix_start] =
Rfit::Line_fit(hits[helix_start], hits_cov[helix_start],
circle_fit[helix_start], fast_fit[helix_start], true);

par_uvrtopak(circle, B, true);
par_uvrtopak(circle_fit[helix_start], B, true);

// Grab helix_fit from the proper location in the output vector
Rfit::helix_fit &helix = results[helix_start];
helix.par << circle.par, line.par;
helix.par << circle_fit[helix_start].par, line_fit[helix_start].par;

// TODO: pass properly error booleans
if (1) {
helix.cov = MatrixXd::Zero(5, 5);
helix.cov.block(0, 0, 3, 3) = circle.cov;
helix.cov.block(3, 3, 2, 2) = line.cov;
}
helix.q = circle.q;
helix.chi2_circle = circle.chi2;
helix.chi2_line = line.chi2;

helix.cov = MatrixXd::Zero(5, 5);
helix.cov.block(0, 0, 3, 3) = circle_fit[helix_start].cov;
helix.cov.block(3, 3, 2, 2) = line_fit[helix_start].cov;

helix.q = circle_fit[helix_start].q;
helix.chi2_circle = circle_fit[helix_start].chi2;
helix.chi2_line = line_fit[helix_start].chi2;

#ifdef GPU_DEBUG

printf("KernelLineFitAllHits line.par(0): %d %f\n", helix_start,
circle_fit[helix_start].par(0));
printf("KernelLineFitAllHits line.par(1): %d %f\n", helix_start,
line_fit[helix_start].par(1));
#endif
}

void PixelTrackReconstructionGPU::launchKernelFit(float * hits_and_covariancesGPU,
int cumulative_size, int hits_in_fit, float B, Rfit::helix_fit * results) {
const dim3 threads_per_block(32,1);
// We need to partition data in blocks of:
// 12(3+9) * hits_in_fit
int num_blocks = cumulative_size/(hits_in_fit*12)/threads_per_block.x + 1;
KernelFullFitAllHits<<<num_blocks, threads_per_block>>>(hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results);
void PixelTrackReconstructionGPU::launchKernelFit(
float *hits_and_covariancesGPU, int cumulative_size, int hits_in_fit,
float B, Rfit::helix_fit *results)
{
const dim3 threads_per_block(32, 1);
int num_blocks = cumulative_size / (hits_in_fit * 12) / threads_per_block.x + 1;
auto numberOfSeeds = cumulative_size / (hits_in_fit * 12);

Rfit::Matrix3xNd *hitsGPU;
cudaCheck(cudaMalloc(&hitsGPU, 48 * numberOfSeeds * sizeof(Rfit::Matrix3xNd(3, 4))));
cudaCheck(cudaMemset(hitsGPU, 0x00, 48 * numberOfSeeds * sizeof(Rfit::Matrix3xNd(3, 4))));

Rfit::Matrix3Nd *hits_covGPU = nullptr;
cudaCheck(cudaMalloc(&hits_covGPU, 48 * numberOfSeeds * sizeof(Rfit::Matrix3Nd(12, 12))));
cudaCheck(cudaMemset(hits_covGPU, 0x00, 48 * numberOfSeeds * sizeof(Rfit::Matrix3Nd(12, 12))));

Vector4d *fast_fit_resultsGPU = nullptr;
cudaCheck(cudaMalloc(&fast_fit_resultsGPU, 48 * numberOfSeeds * sizeof(Vector4d)));
cudaCheck(cudaMemset(fast_fit_resultsGPU, 0x00, 48 * numberOfSeeds * sizeof(Vector4d)));

Rfit::circle_fit *circle_fit_resultsGPU = nullptr;
cudaCheck(cudaMalloc(&circle_fit_resultsGPU, 48 * numberOfSeeds * sizeof(Rfit::circle_fit)));
cudaCheck(cudaMemset(circle_fit_resultsGPU, 0x00, 48 * numberOfSeeds * sizeof(Rfit::circle_fit)));

Rfit::line_fit *line_fit_resultsGPU = nullptr;
cudaCheck(cudaMalloc(&line_fit_resultsGPU, numberOfSeeds * sizeof(Rfit::line_fit)));
cudaCheck(cudaMemset(line_fit_resultsGPU, 0x00, numberOfSeeds * sizeof(Rfit::line_fit)));

KernelFastFitAllHits<<<num_blocks, threads_per_block>>>(
hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results,
hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU,
line_fit_resultsGPU);
cudaCheck(cudaGetLastError());

KernelCircleFitAllHits<<<num_blocks, threads_per_block>>>(
hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results,
hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU,
line_fit_resultsGPU);
cudaCheck(cudaGetLastError());

KernelLineFitAllHits<<<num_blocks, threads_per_block>>>(
hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results,
hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU,
line_fit_resultsGPU);
cudaCheck(cudaGetLastError());

cudaFree(hitsGPU);
cudaFree(hits_covGPU);
cudaFree(fast_fit_resultsGPU);
cudaFree(circle_fit_resultsGPU);
cudaFree(line_fit_resultsGPU);
}

0 comments on commit 0301b0d

Please sign in to comment.