Skip to content

Commit

Permalink
Add a test for CUDA library build rules
Browse files Browse the repository at this point in the history
  • Loading branch information
fwyzard committed Jan 24, 2023
1 parent ed61bcb commit 43491cb
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 0 deletions.
4 changes: 4 additions & 0 deletions HeterogeneousCore/CUDATestDeviceLib/BuildFile.xml
@@ -0,0 +1,4 @@
<use name="cuda"/>
<export>
<lib name="1"/>
</export>
12 changes: 12 additions & 0 deletions HeterogeneousCore/CUDATestDeviceLib/interface/DeviceAddition.h
@@ -0,0 +1,12 @@
#ifndef HeterogeneousCore_CUDATestDeviceLib_interface_DeviceAddition_h
#define HeterogeneousCore_CUDATestDeviceLib_interface_DeviceAddition_h

#include <cuda_runtime.h>

// Device-side element-wise sum of two float vectors: out[i] = in1[i] + in2[i] for i in [0, size).
// To be called from device code by the threads of a 1-dimensional launch: each thread starts at its
// global index along x and advances by the total number of threads in the grid along x.
// The three buffers are declared __restrict__, so they must not overlap.
__device__
void add_vectors_f(const float* __restrict__ in1, const float* __restrict__ in2, float* __restrict__ out, size_t size);

// Device-side element-wise sum of two double vectors: out[i] = in1[i] + in2[i] for i in [0, size).
// To be called from device code by the threads of a 1-dimensional launch: each thread starts at its
// global index along x and advances by the total number of threads in the grid along x.
// The three buffers are declared __restrict__, so they must not overlap.
__device__
void add_vectors_d(const double* __restrict__ in1, const double* __restrict__ in2, double* __restrict__ out, size_t size);

#endif // HeterogeneousCore_CUDATestDeviceLib_interface_DeviceAddition_h
25 changes: 25 additions & 0 deletions HeterogeneousCore/CUDATestDeviceLib/src/DeviceAddition.cu
@@ -0,0 +1,25 @@
#include <cstdint>

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDATestDeviceLib/interface/DeviceAddition.h"

/**
 * Element-wise addition of two single-precision vectors: out[i] = in1[i] + in2[i].
 *
 * Device function meant to be called by every thread of a 1-dimensional launch.
 * The elements are distributed with a grid-stride loop: each thread starts from
 * its global index along x and advances by the total number of threads in the
 * grid, so any launch configuration covers the whole [0, size) range.
 *
 * @param in1   first input vector, read for indices in [0, size)
 * @param in2   second input vector, read for indices in [0, size)
 * @param out   output vector, written for indices in [0, size); must not alias the inputs (__restrict__)
 * @param size  number of elements to process; a size of 0 performs no work
 */
__device__
void add_vectors_f(const float* __restrict__ in1, const float* __restrict__ in2, float* __restrict__ out, size_t size) {
  // Promote to size_t *before* multiplying: the built-in index variables are 32-bit
  // unsigned, and gridDim.x * blockDim.x can reach 2^32 on large launches. In 32-bit
  // arithmetic the stride would then wrap around (possibly to 0, making the loop
  // below spin forever) and distinct threads would alias the same starting index.
  const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

  for (size_t i = first; i < size; i += stride) {
    out[i] = in1[i] + in2[i];
  }
}

__device__
void add_vectors_d(const double* __restrict__ in1, const double* __restrict__ in2, double* __restrict__ out, size_t size) {
uint32_t thread = threadIdx.x + blockIdx.x * blockDim.x;
uint32_t stride = blockDim.x * gridDim.x;

for (size_t i = thread; i < size; i += stride) {
out[i] = in1[i] + in2[i];
}
}
8 changes: 8 additions & 0 deletions HeterogeneousCore/CUDATestDeviceLib/test/BuildFile.xml
@@ -0,0 +1,8 @@
<iftool name="cuda-gcc-support">
<bin file="testDeviceAddition.cu" name="testCudaDeviceAddition">
<use name="catch2"/>
<use name="cuda"/>
<use name="HeterogeneousCore/CUDATestDeviceLib"/>
<use name="HeterogeneousCore/CUDAUtilities" source_only="true"/>
</bin>
</iftool>
89 changes: 89 additions & 0 deletions HeterogeneousCore/CUDATestDeviceLib/test/testDeviceAddition.cu
@@ -0,0 +1,89 @@
#include <cstdint>
#include <random>
#include <vector>

#define CATCH_CONFIG_MAIN
#include <catch.hpp>

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDATestDeviceLib/interface/DeviceAddition.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

/*
__device__
void add_vectors_f(const float* __restrict__ in1, const float* __restrict__ in2, float* __restrict__ out, size_t size) {
uint32_t thread = threadIdx.x + blockIdx.x * blockDim.x;
uint32_t stride = blockDim.x * gridDim.x;
for (size_t i = thread; i < size; i += stride) {
out[i] = in1[i] + in2[i];
}
}
*/

// Kernel entry point for the test: every thread forwards its arguments unchanged to the
// device function add_vectors_f() declared in DeviceAddition.h. The kernel has no logic
// of its own; it exists so that the test calls a device function built in another library.
__global__ void kernel_add_vectors_f(const float* __restrict__ in1,
                                     const float* __restrict__ in2,
                                     float* __restrict__ out,
                                     size_t size) {
  add_vectors_f(in1, in2, out, size);
}


// Checks that a kernel defined in this test can call add_vectors_f(), a __device__
// function declared in DeviceAddition.h, and that the result matches the same
// sums computed on the host.
TEST_CASE("HeterogeneousCore/CUDATestDeviceLib test", "[cudaTestDeviceAddition]") {
  // random number generator with a gaussian distribution
  std::random_device rd{};
  std::default_random_engine rand{rd()};
  std::normal_distribution<float> dist{0.f, 1.f};

  // tolerance
  constexpr float epsilon = 0.000001f;

  // buffer size
  constexpr size_t size = 1024 * 1024;

  // allocate input and output host buffers
  std::vector<float> in1_h(size);
  std::vector<float> in2_h(size);
  std::vector<float> out_h(size);

  // fill the input buffers with random data, and the output buffer with zeros
  for (size_t i = 0; i < size; ++i) {
    in1_h[i] = dist(rand);
    in2_h[i] = dist(rand);
    out_h[i] = 0.f;
  }

  SECTION("Test add_vectors_f") {
    // allocate input and output buffers on the device
    float* in1_d = nullptr;
    float* in2_d = nullptr;
    float* out_d = nullptr;
    cudaCheck(cudaMalloc(&in1_d, size * sizeof(float)));
    cudaCheck(cudaMalloc(&in2_d, size * sizeof(float)));
    cudaCheck(cudaMalloc(&out_d, size * sizeof(float)));

    // copy the input data to the device
    cudaCheck(cudaMemcpy(in1_d, in1_h.data(), size * sizeof(float), cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(in2_d, in2_h.data(), size * sizeof(float), cudaMemcpyHostToDevice));

    // fill the output buffer with zeros
    cudaCheck(cudaMemset(out_d, 0, size * sizeof(float)));

    // launch the 1-dimensional kernel for vector addition: 32 blocks of 32 threads,
    // far fewer threads than elements, so each thread handles many elements
    kernel_add_vectors_f<<<32, 32>>>(in1_d, in2_d, out_d, size);
    cudaCheck(cudaGetLastError());

    // copy the results from the device to the host
    cudaCheck(cudaMemcpy(out_h.data(), out_d, size * sizeof(float), cudaMemcpyDeviceToHost));

    // wait for all the operations to complete
    cudaCheck(cudaDeviceSynchronize());

    // release the device buffers now that the results are on the host; doing it
    // before the checks guarantees that a failing REQUIRE cannot skip the cleanup
    cudaCheck(cudaFree(in1_d));
    cudaCheck(cudaFree(in2_d));
    cudaCheck(cudaFree(out_d));

    // check the results
    for (size_t i = 0; i < size; ++i) {
      float sum = in1_h[i] + in2_h[i];
      REQUIRE(out_h[i] < sum + epsilon);
      REQUIRE(out_h[i] > sum - epsilon);
    }
  }
}

0 comments on commit 43491cb

Please sign in to comment.