From 43491cb5635a4da213a4d3151ed0a309bc1ced03 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Tue, 24 Jan 2023 18:32:13 +0100
Subject: [PATCH] Add a test for CUDA library build rules

---
 .../CUDATestDeviceLib/BuildFile.xml           |  4 +
 .../interface/DeviceAddition.h                | 12 +++
 .../CUDATestDeviceLib/src/DeviceAddition.cu   | 25 ++++++
 .../CUDATestDeviceLib/test/BuildFile.xml      |  8 ++
 .../test/testDeviceAddition.cu                | 89 +++++++++++++++++++
 5 files changed, 138 insertions(+)
 create mode 100644 HeterogeneousCore/CUDATestDeviceLib/BuildFile.xml
 create mode 100644 HeterogeneousCore/CUDATestDeviceLib/interface/DeviceAddition.h
 create mode 100644 HeterogeneousCore/CUDATestDeviceLib/src/DeviceAddition.cu
 create mode 100644 HeterogeneousCore/CUDATestDeviceLib/test/BuildFile.xml
 create mode 100644 HeterogeneousCore/CUDATestDeviceLib/test/testDeviceAddition.cu
diff --git a/HeterogeneousCore/CUDATestDeviceLib/BuildFile.xml b/HeterogeneousCore/CUDATestDeviceLib/BuildFile.xml
new file mode 100644
index 0000000000000..fbb0f89e301db
--- /dev/null
+++ b/HeterogeneousCore/CUDATestDeviceLib/BuildFile.xml
@@ -0,0 +1,4 @@
+<use name="cuda"/>
+<export>
+  <lib name="1"/>
+</export>
diff --git a/HeterogeneousCore/CUDATestDeviceLib/interface/DeviceAddition.h b/HeterogeneousCore/CUDATestDeviceLib/interface/DeviceAddition.h
new file mode 100644
index 0000000000000..2fcd7b2fa93cc
--- /dev/null
+++ b/HeterogeneousCore/CUDATestDeviceLib/interface/DeviceAddition.h
@@ -0,0 +1,12 @@
+#ifndef HeterogeneousCore_CUDATestDeviceLib_interface_DeviceAddition_h
+#define HeterogeneousCore_CUDATestDeviceLib_interface_DeviceAddition_h
+
+#include <cuda_runtime.h>
+
+__device__ 
+void add_vectors_f(const float* __restrict__ in1, const float* __restrict__ in2, float* __restrict__ out, size_t size);
+
+__device__ 
+void add_vectors_d(const double* __restrict__ in1, const double* __restrict__ in2, double* __restrict__ out, size_t size);
+
+#endif  // HeterogeneousCore_CUDATestDeviceLib_interface_DeviceAddition_h
diff --git a/HeterogeneousCore/CUDATestDeviceLib/src/DeviceAddition.cu b/HeterogeneousCore/CUDATestDeviceLib/src/DeviceAddition.cu
new file mode 100644
index 0000000000000..009f8bceed3c2
--- /dev/null
+++ b/HeterogeneousCore/CUDATestDeviceLib/src/DeviceAddition.cu
@@ -0,0 +1,25 @@
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+#include "HeterogeneousCore/CUDATestDeviceLib/interface/DeviceAddition.h"
+
+__device__ 
+void add_vectors_f(const float* __restrict__ in1, const float* __restrict__ in2, float* __restrict__ out, size_t size) {
+  uint32_t thread = threadIdx.x + blockIdx.x * blockDim.x;
+  uint32_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = thread; i < size; i += stride) {
+    out[i] = in1[i] + in2[i];
+  }
+}
+
+__device__ 
+void add_vectors_d(const double* __restrict__ in1, const double* __restrict__ in2, double* __restrict__ out, size_t size) {
+  uint32_t thread = threadIdx.x + blockIdx.x * blockDim.x;
+  uint32_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = thread; i < size; i += stride) {
+    out[i] = in1[i] + in2[i];
+  }
+}
diff --git a/HeterogeneousCore/CUDATestDeviceLib/test/BuildFile.xml b/HeterogeneousCore/CUDATestDeviceLib/test/BuildFile.xml
new file mode 100644
index 0000000000000..c76079c44be9d
--- /dev/null
+++ b/HeterogeneousCore/CUDATestDeviceLib/test/BuildFile.xml
@@ -0,0 +1,8 @@
+<iftool name="cuda-gcc-support">
+  <bin file="testDeviceAddition.cu" name="testCudaDeviceAddition">
+    <use name="catch2"/>
+    <use name="cuda"/>
+    <use name="HeterogeneousCore/CUDATestDeviceLib"/>
+    <use name="HeterogeneousCore/CUDAUtilities" source_only="true"/>
+  </bin>
+</iftool>
diff --git a/HeterogeneousCore/CUDATestDeviceLib/test/testDeviceAddition.cu b/HeterogeneousCore/CUDATestDeviceLib/test/testDeviceAddition.cu
new file mode 100644
index 0000000000000..b6728f775652f
--- /dev/null
+++ b/HeterogeneousCore/CUDATestDeviceLib/test/testDeviceAddition.cu
@@ -0,0 +1,89 @@
+#include <cstdint>
+#include <random>
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include <cuda_runtime.h>
+
+#include "HeterogeneousCore/CUDATestDeviceLib/interface/DeviceAddition.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+/*
+__device__ 
+void add_vectors_f(const float* __restrict__ in1, const float* __restrict__ in2, float* __restrict__ out, size_t size) {
+  uint32_t thread = threadIdx.x + blockIdx.x * blockDim.x;
+  uint32_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = thread; i < size; i += stride) {
+    out[i] = in1[i] + in2[i];
+  }
+}
+*/
+
+__global__
+void kernel_add_vectors_f(const float* __restrict__ in1, const float* __restrict__ in2, float* __restrict__ out, size_t size) {
+  add_vectors_f(in1, in2, out, size);
+}
+
+
+TEST_CASE("HeterogeneousCore/CUDATestDeviceLib test", "[cudaTestDeviceAddition]") {
+  // random number generator with a gaussian distribution
+  std::random_device rd{};
+  std::default_random_engine rand{rd()};
+  std::normal_distribution<float> dist{0., 1.};
+
+  // tolerance
+  constexpr float epsilon = 0.000001;
+
+  // buffer size
+  constexpr size_t size = 1024 * 1024;
+
+  // allocate input and output host buffers
+  std::vector<float> in1_h(size);
+  std::vector<float> in2_h(size);
+  std::vector<float> out_h(size);
+
+  // fill the input buffers with random data, and the output buffer with zeros
+  for (size_t i = 0; i < size; ++i) {
+    in1_h[i] = dist(rand);
+    in2_h[i] = dist(rand);
+    out_h[i] = 0.;
+  }
+
+  SECTION("Test add_vectors_f") {
+    // allocate input and output buffers on the device
+    float* in1_d;
+    float* in2_d;
+    float* out_d;
+    cudaCheck(cudaMalloc(&in1_d, size * sizeof(float)));
+    cudaCheck(cudaMalloc(&in2_d, size * sizeof(float)));
+    cudaCheck(cudaMalloc(&out_d, size * sizeof(float)));
+
+    // copy the input data to the device
+    cudaCheck(cudaMemcpy(in1_d, in1_h.data(), size * sizeof(float), cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(in2_d, in2_h.data(), size * sizeof(float), cudaMemcpyHostToDevice));
+
+    // fill the output buffer with zeros
+    cudaCheck(cudaMemset(out_d, 0, size * sizeof(float)));
+
+    // launch the 1-dimensional kernel for vector addition
+    kernel_add_vectors_f<<<32, 32>>>(in1_d, in2_d, out_d, size);
+    cudaCheck(cudaGetLastError());
+
+    // copy the results from the device to the host
+    cudaCheck(cudaMemcpy(out_h.data(), out_d, size * sizeof(float), cudaMemcpyDeviceToHost));
+
+    // wait for all the operations to complete
+    cudaCheck(cudaDeviceSynchronize());
+
+    // check the results
+    for (size_t i = 0; i < size; ++i) {
+      float sum = in1_h[i] + in2_h[i];
+      REQUIRE(out_h[i] < sum + epsilon);
+      REQUIRE(out_h[i] > sum - epsilon);
+    }
+  }
+
+}