Fix the Riemann fit on GPU (cms-sw#128)

VinInn · Aug 10, 2018 · 0301b0d · 0301b0d
1 parent 8c2bc8f
commit 0301b0d
Show file tree

Hide file tree

Showing 3 changed files with 188 additions and 93 deletions.
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cc
@@ -71,7 +71,7 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks,
 
   std::vector<float> hits_and_covariances;
   float * hits_and_covariancesGPU = nullptr;
-  std::vector<Rfit::helix_fit> helix_fit_results;
+  Rfit::helix_fit * helix_fit_results = nullptr;
   Rfit::helix_fit * helix_fit_resultsGPU = nullptr;
 
   const int points_in_seed = 4;
@@ -101,26 +101,24 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks,
   }
 
   // We pretend to have one fit for every seed
-  helix_fit_results.resize(total_seeds);
-
-  cudaCheck(cudaMalloc((void**)&hits_and_covariancesGPU, sizeof(float)*hits_and_covariances.size()));
-  cudaCheck(cudaMalloc((void**)&helix_fit_resultsGPU, sizeof(Rfit::helix_fit)*helix_fit_results.size()));
+  cudaCheck(cudaMallocHost(&helix_fit_results, sizeof(Rfit::helix_fit)*total_seeds));
+  cudaCheck(cudaMalloc(&hits_and_covariancesGPU, sizeof(float)*hits_and_covariances.size()));
+  cudaCheck(cudaMalloc(&helix_fit_resultsGPU, sizeof(Rfit::helix_fit)*total_seeds));
+  cudaCheck(cudaMemset(helix_fit_resultsGPU, 0x00, sizeof(Rfit::helix_fit)*total_seeds ));
   // CUDA MALLOC OF HITS AND COV AND HELIX_FIT RESULTS
 
   // CUDA MEMCOPY HOST2DEVICE OF HITS AND COVS AND HELIX_FIT RESULTS
   cudaCheck(cudaMemcpy(hits_and_covariancesGPU, hits_and_covariances.data(),
-      sizeof(float)*hits_and_covariances.size(), cudaMemcpyHostToDevice));
+      sizeof(float)*hits_and_covariances.size(), cudaMemcpyDefault));
 
   // LAUNCH THE KERNEL FIT
   launchKernelFit(hits_and_covariancesGPU, 12*4*total_seeds, 4,
       bField, helix_fit_resultsGPU);
   // CUDA MEMCOPY DEVICE2HOST OF HELIX_FIT
   cudaCheck(cudaDeviceSynchronize());
-//  cudaCheck(cudaMemcpy(helix_fit_results.data(), helix_fit_resultsGPU,
-//      sizeof(Rfit::helix_fit)*helix_fit_results.size(), cudaMemcpyDeviceToHost));
   cudaCheck(cudaGetLastError());
-  cudaCheck(cudaMemcpy(helix_fit_results.data(), helix_fit_resultsGPU,
-      sizeof(Rfit::helix_fit)*helix_fit_results.size(), cudaMemcpyDeviceToHost));
+  cudaCheck(cudaMemcpy(helix_fit_results, helix_fit_resultsGPU,
+      sizeof(Rfit::helix_fit)*total_seeds, cudaMemcpyDefault));
 
   cudaCheck(cudaFree(hits_and_covariancesGPU));
   cudaCheck(cudaFree(helix_fit_resultsGPU));
@@ -137,7 +135,11 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks,
     const TrackingRegion& region = regionHitSets.region();
     for(const SeedingHitSet& tuplet: regionHitSets) {
       auto nHits = tuplet.size(); hits.resize(nHits);
-      for (unsigned int iHit = 0; iHit < nHits; ++iHit) hits[iHit] = tuplet[iHit];
+
+      for (unsigned int iHit = 0; iHit < nHits; ++iHit)
+      {
+          hits[iHit] = tuplet[iHit];
+      }
       auto const &fittedTrack = helix_fit_results[counter];
       counter++;
       int iCharge       = fittedTrack.q;
@@ -146,7 +148,7 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks,
       float valPt       = fittedTrack.par(2);
       float valCotTheta = fittedTrack.par(3);
       float valZip      = fittedTrack.par(4);
-      //
+
       //  PixelTrackErrorParam param(valEta, valPt);
       float errValPhi = std::sqrt(fittedTrack.cov(0, 0));
       float errValTip = std::sqrt(fittedTrack.cov(1, 1));
@@ -178,7 +180,9 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks,
     }
   }
 
-  // skip ovelrapped tracks
+  cudaCheck(cudaFreeHost(helix_fit_results));
+
+  // skip overlapped tracks
   if(!theCleanerName.empty()) {
     edm::ESHandle<PixelTrackCleaner> hcleaner;
     es.get<PixelTrackCleaner::Record>().get(theCleanerName, hcleaner);

diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cu b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cu
@@ -4,18 +4,20 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "PixelTrackReconstructionGPU.h"
 
-#ifndef GPU_DEBUG
-#define GPU_DEBUG 0
-#endif // GPU_DEBUG
-
 using namespace Eigen;
 
-__global__
-void KernelFullFitAllHits(float * hits_and_covariances,
+__global__ void
+KernelFastFitAllHits(float *hits_and_covariances,
     int hits_in_fit,
     int cumulative_size,
-    double B,
-    Rfit::helix_fit * results) {
+    float B,
+    Rfit::helix_fit *results,
+    Rfit::Matrix3xNd *hits,
+    Rfit::Matrix3Nd *hits_cov,
+    Rfit::circle_fit *circle_fit,
+    Vector4d *fast_fit,
+    Rfit::line_fit *line_fit)
+{
   // Reshape Eigen components from hits_and_covariances, using proper thread and block indices
   // Perform the fit
   // Store the results in the proper vector, using again correct indices
@@ -24,101 +26,187 @@ void KernelFullFitAllHits(float * hits_and_covariances,
   //   first 3 are the points
   //   the rest is the covariance matrix, 3x3
   int start = (blockIdx.x * blockDim.x + threadIdx.x) * hits_in_fit * 12;
-  int helix_start = (blockIdx.x * blockDim.x + threadIdx.x); 
+  int helix_start = (blockIdx.x * blockDim.x + threadIdx.x);
   if (start >= cumulative_size) {
     return;
   }
 
-#if GPU_DEBUG
+#ifdef GPU_DEBUG
   printf("BlockDim.x: %d, BlockIdx.x: %d, threadIdx.x: %d, start: %d, cumulative_size: %d\n",
       blockDim.x, blockIdx.x, threadIdx.x, start, cumulative_size);
 #endif
 
-  Rfit::Matrix3xNd hits(3,hits_in_fit);
-  Rfit::Matrix3Nd hits_cov(3 * hits_in_fit, 3 * hits_in_fit);
+  hits[helix_start].resize(3, hits_in_fit);
+  hits_cov[helix_start].resize(3 * hits_in_fit, 3 * hits_in_fit);
 
   // Prepare data structure (stack)
   for (unsigned int i = 0; i < hits_in_fit; ++i) {
-    hits.col(i) << hits_and_covariances[start],
-      hits_and_covariances[start+1],hits_and_covariances[start+2];
+    hits[helix_start].col(i) << hits_and_covariances[start],
+        hits_and_covariances[start + 1], hits_and_covariances[start + 2];
     start += 3;
 
     for (auto j = 0; j < 3; ++j) {
       for (auto l = 0; l < 3; ++l) {
-        hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = hits_and_covariances[start];
+        hits_cov[helix_start](i + j * hits_in_fit, i + l * hits_in_fit) =
+            hits_and_covariances[start];
         start++;
       }
     }
   }
 
-#if GPU_DEBUG
-  printf("KernelFullFitAllHits hits(0,0): %d\t%f\n", helix_start, hits(0,0));
-  printf("KernelFullFitAllHits hits(0,1): %d\t%f\n", helix_start, hits(0,1));
-  printf("KernelFullFitAllHits hits(0,2): %d\t%f\n", helix_start, hits(0,2));
-  printf("KernelFullFitAllHits hits(0,3): %d\t%f\n", helix_start, hits(0,3));
-  printf("KernelFullFitAllHits hits(1,0): %d\t%f\n", helix_start, hits(1,0));
-  printf("KernelFullFitAllHits hits(1,1): %d\t%f\n", helix_start, hits(1,1));
-  printf("KernelFullFitAllHits hits(1,2): %d\t%f\n", helix_start, hits(1,2));
-  printf("KernelFullFitAllHits hits(1,3): %d\t%f\n", helix_start, hits(1,3));
-  printf("KernelFullFitAllHits hits(2,0): %d\t%f\n", helix_start, hits(2,0));
-  printf("KernelFullFitAllHits hits(2,1): %d\t%f\n", helix_start, hits(2,1));
-  printf("KernelFullFitAllHits hits(2,2): %d\t%f\n", helix_start, hits(2,2));
-  printf("KernelFullFitAllHits hits(2,3): %d\t%f\n", helix_start, hits(2,3));
-  Rfit::printIt(&hits);
-  Rfit::printIt(&hits_cov);
-#endif
+  fast_fit[helix_start] = Rfit::Fast_fit(hits[helix_start]);
+}
 
-  // Perform actual fit
-  Vector4d fast_fit = Rfit::Fast_fit(hits);
+__global__ void
+KernelCircleFitAllHits(float *hits_and_covariances, int hits_in_fit,
+                       int cumulative_size, float B, Rfit::helix_fit *results,
+                       Rfit::Matrix3xNd *hits, Rfit::Matrix3Nd *hits_cov,
+                       Rfit::circle_fit *circle_fit, Vector4d *fast_fit,
+                       Rfit::line_fit *line_fit)
+{
+  // Reshape Eigen components from hits_and_covariances, using proper thread and block indices
+  // Perform the fit
+  // Store the results in the proper vector, using again correct indices
 
-#if GPU_DEBUG
-  printf("KernelFullFitAllHits fast_fit(0): %d %f\n", helix_start, fast_fit(0));
-  printf("KernelFullFitAllHits fast_fit(1): %d %f\n", helix_start, fast_fit(1));
-  printf("KernelFullFitAllHits fast_fit(2): %d %f\n", helix_start, fast_fit(2));
-  printf("KernelFullFitAllHits fast_fit(3): %d %f\n", helix_start, fast_fit(3));
-#endif
+  // Loop for hits_in_fit times:
+  //   first 3 are the points
+  //   the rest is the covariance matrix, 3x3
+  int start = (blockIdx.x * blockDim.x + threadIdx.x) * hits_in_fit * 12;
+  int helix_start = (blockIdx.x * blockDim.x + threadIdx.x);
+  if (start >= cumulative_size) {
+    return;
+  }
 
-  u_int n = hits.cols();
-#if GPU_DEBUG
-  printf("KernelFullFitAllHits using %d hits: %d\n", n, helix_start);
+#ifdef GPU_DEBUG
+    printf("BlockDim.x: %d, BlockIdx.x: %d, threadIdx.x: %d, start: %d, "
+           "cumulative_size: %d\n",
+           blockDim.x, blockIdx.x, threadIdx.x, start, cumulative_size);
+#endif
+  u_int n = hits[helix_start].cols();
+
+  Rfit::VectorNd rad = (hits[helix_start].block(0, 0, 2, n).colwise().norm());
+
+  circle_fit[helix_start] =
+      Rfit::Circle_fit(hits[helix_start].block(0, 0, 2, n),
+                       hits_cov[helix_start].block(0, 0, 2 * n, 2 * n),
+                       fast_fit[helix_start], rad, B, true, true);
+
+#ifdef GPU_DEBUG
+    printf("KernelCircleFitAllHits circle.par(0): %d %f\n", helix_start,
+           circle_fit[helix_start].par(0));
+    printf("KernelCircleFitAllHits circle.par(1): %d %f\n", helix_start,
+           circle_fit[helix_start].par(1));
+    printf("KernelCircleFitAllHits circle.par(2): %d %f\n", helix_start,
+           circle_fit[helix_start].par(2));
 #endif
 
-  Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm());
+}
+
+__global__ void
+KernelLineFitAllHits(float *hits_and_covariances, int hits_in_fit,
+                     int cumulative_size, float B, Rfit::helix_fit *results,
+                     Rfit::Matrix3xNd *hits, Rfit::Matrix3Nd *hits_cov,
+                     Rfit::circle_fit *circle_fit, Vector4d *fast_fit,
+                     Rfit::line_fit *line_fit)
+{
+  // Reshape Eigen components from hits_and_covariances, using proper thread and block indices
+  // Perform the fit
+  // Store the results in the proper vector, using again correct indices
+
+  // Loop for hits_in_fit times:
+  //   first 3 are the points
+  //   the rest is the covariance matrix, 3x3
+  int start = (blockIdx.x * blockDim.x + threadIdx.x) * hits_in_fit * 12;
+  int helix_start = (blockIdx.x * blockDim.x + threadIdx.x);
+  if (start >= cumulative_size) {
+    return;
+  }
 
-  Rfit::circle_fit circle = 
-    Rfit::Circle_fit(hits.block(0,0,2,n), hits_cov.block(0, 0, 2 * n, 2 * n),
-      fast_fit, rad, B, true, true);
+#ifdef GPU_DEBUG
 
-#if GPU_DEBUG
-  printf("KernelFullFitAllHits circle.par(0): %d %f\n", helix_start, circle.par(0));
-  printf("KernelFullFitAllHits circle.par(1): %d %f\n", helix_start, circle.par(1));
-  printf("KernelFullFitAllHits circle.par(2): %d %f\n", helix_start, circle.par(2));
+    printf("BlockDim.x: %d, BlockIdx.x: %d, threadIdx.x: %d, start: %d, "
+           "cumulative_size: %d\n",
+           blockDim.x, blockIdx.x, threadIdx.x, start, cumulative_size);
 #endif
 
-  Rfit::line_fit line = Rfit::Line_fit(hits, hits_cov, circle, fast_fit, true);
+  line_fit[helix_start] =
+      Rfit::Line_fit(hits[helix_start], hits_cov[helix_start],
+                     circle_fit[helix_start], fast_fit[helix_start], true);
 
-  par_uvrtopak(circle, B, true);
+  par_uvrtopak(circle_fit[helix_start], B, true);
 
   // Grab helix_fit from the proper location in the output vector
   Rfit::helix_fit &helix = results[helix_start];
-  helix.par << circle.par, line.par;
+  helix.par << circle_fit[helix_start].par, line_fit[helix_start].par;
+
   // TODO: pass properly error booleans
-  if (1) {
-    helix.cov = MatrixXd::Zero(5, 5);
-    helix.cov.block(0, 0, 3, 3) = circle.cov;
-    helix.cov.block(3, 3, 2, 2) = line.cov;
-  }
-  helix.q = circle.q;
-  helix.chi2_circle = circle.chi2;
-  helix.chi2_line = line.chi2;
+
+  helix.cov = MatrixXd::Zero(5, 5);
+  helix.cov.block(0, 0, 3, 3) = circle_fit[helix_start].cov;
+  helix.cov.block(3, 3, 2, 2) = line_fit[helix_start].cov;
+
+  helix.q = circle_fit[helix_start].q;
+  helix.chi2_circle = circle_fit[helix_start].chi2;
+  helix.chi2_line = line_fit[helix_start].chi2;
+
+#ifdef GPU_DEBUG
+
+    printf("KernelLineFitAllHits line.par(0): %d %f\n", helix_start,
+           circle_fit[helix_start].par(0));
+    printf("KernelLineFitAllHits line.par(1): %d %f\n", helix_start,
+           line_fit[helix_start].par(1));
+#endif
 }
 
-void PixelTrackReconstructionGPU::launchKernelFit(float * hits_and_covariancesGPU, 
-    int cumulative_size, int hits_in_fit, float B, Rfit::helix_fit * results) {
-  const dim3 threads_per_block(32,1);
-  // We need to partition data in blocks of:
-  // 12(3+9) * hits_in_fit
-  int num_blocks = cumulative_size/(hits_in_fit*12)/threads_per_block.x + 1;
-  KernelFullFitAllHits<<<num_blocks, threads_per_block>>>(hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results);
+void PixelTrackReconstructionGPU::launchKernelFit(
+    float *hits_and_covariancesGPU, int cumulative_size, int hits_in_fit,
+    float B, Rfit::helix_fit *results)
+{
+  const dim3 threads_per_block(32, 1);
+  int num_blocks = cumulative_size / (hits_in_fit * 12) / threads_per_block.x + 1;
+  auto numberOfSeeds = cumulative_size / (hits_in_fit * 12);
+
+  Rfit::Matrix3xNd *hitsGPU;
+  cudaCheck(cudaMalloc(&hitsGPU, 48 * numberOfSeeds * sizeof(Rfit::Matrix3xNd(3, 4))));
+  cudaCheck(cudaMemset(hitsGPU, 0x00, 48 * numberOfSeeds * sizeof(Rfit::Matrix3xNd(3, 4))));
+
+  Rfit::Matrix3Nd *hits_covGPU = nullptr;
+  cudaCheck(cudaMalloc(&hits_covGPU, 48 * numberOfSeeds * sizeof(Rfit::Matrix3Nd(12, 12))));
+  cudaCheck(cudaMemset(hits_covGPU, 0x00, 48 * numberOfSeeds * sizeof(Rfit::Matrix3Nd(12, 12))));
+
+  Vector4d *fast_fit_resultsGPU = nullptr;
+  cudaCheck(cudaMalloc(&fast_fit_resultsGPU, 48 * numberOfSeeds * sizeof(Vector4d)));
+  cudaCheck(cudaMemset(fast_fit_resultsGPU, 0x00, 48 * numberOfSeeds * sizeof(Vector4d)));
+
+  Rfit::circle_fit *circle_fit_resultsGPU = nullptr;
+  cudaCheck(cudaMalloc(&circle_fit_resultsGPU, 48 * numberOfSeeds * sizeof(Rfit::circle_fit)));
+  cudaCheck(cudaMemset(circle_fit_resultsGPU, 0x00, 48 * numberOfSeeds * sizeof(Rfit::circle_fit)));
+
+  Rfit::line_fit *line_fit_resultsGPU = nullptr;
+  cudaCheck(cudaMalloc(&line_fit_resultsGPU, numberOfSeeds * sizeof(Rfit::line_fit)));
+  cudaCheck(cudaMemset(line_fit_resultsGPU, 0x00, numberOfSeeds * sizeof(Rfit::line_fit)));
+
+  KernelFastFitAllHits<<<num_blocks, threads_per_block>>>(
+      hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results,
+      hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU,
+      line_fit_resultsGPU);
   cudaCheck(cudaGetLastError());
+
+  KernelCircleFitAllHits<<<num_blocks, threads_per_block>>>(
+      hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results,
+      hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU,
+      line_fit_resultsGPU);
+  cudaCheck(cudaGetLastError());
+
+  KernelLineFitAllHits<<<num_blocks, threads_per_block>>>(
+      hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results,
+      hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU,
+      line_fit_resultsGPU);
+  cudaCheck(cudaGetLastError());
+
+  cudaFree(hitsGPU);
+  cudaFree(hits_covGPU);
+  cudaFree(fast_fit_resultsGPU);
+  cudaFree(circle_fit_resultsGPU);
+  cudaFree(line_fit_resultsGPU);
 }