// testDeviceAddition.cu
#include <cstdint>
#include <random>
#include <vector>
#define CATCH_CONFIG_MAIN
#include <catch.hpp>
#include <cuda_runtime.h>
#include "HeterogeneousCore/CUDATestDeviceLib/interface/DeviceAddition.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
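/*
 * For reference only: commented-out copy of add_vectors_f(), the device function called by the
 * kernel below. The definition actually used is presumably the one provided through
 * DeviceAddition.h (TODO: confirm it matches this copy).

__device__
void add_vectors_f(const float* __restrict__ in1, const float* __restrict__ in2, float* __restrict__ out, size_t size) {
  uint32_t thread = threadIdx.x + blockIdx.x * blockDim.x;
  uint32_t stride = blockDim.x * gridDim.x;

  for (size_t i = thread; i < size; i += stride) {
    out[i] = in1[i] + in2[i];
  }
}
*/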
/**
 * Kernel entry point that forwards its arguments, unchanged, to the device function
 * add_vectors_f().
 *
 * @param in1   first input buffer (read-only, declared non-aliasing via __restrict__)
 * @param in2   second input buffer (read-only, declared non-aliasing via __restrict__)
 * @param out   output buffer (declared non-aliasing via __restrict__)
 * @param size  element count passed through to add_vectors_f()
 *
 * NOTE(review): how the work is split across threads is decided entirely by add_vectors_f(),
 * which is not defined in this file - presumably it comes from DeviceAddition.h.
 */
__global__ void kernel_add_vectors_f(const float* __restrict__ in1,
                                     const float* __restrict__ in2,
                                     float* __restrict__ out,
                                     size_t size) {
  add_vectors_f(in1, in2, out, size);
}
// Checks that kernel_add_vectors_f (and therefore add_vectors_f) writes the element-wise sum of
// two randomly filled input vectors, within an absolute tolerance, for 1024 * 1024 elements.
TEST_CASE("HeterogeneousCore/CUDATestDeviceLib test", "[cudaTestDeviceAddition]") {
  // random number generator with a gaussian distribution
  // (seeded from std::random_device, so the inputs differ from run to run)
  std::random_device rd{};
  std::default_random_engine rand{rd()};
  std::normal_distribution<float> dist{0.f, 1.f};

  // absolute tolerance used when comparing device and host results
  constexpr float epsilon = 0.000001f;

  // buffer size, in elements and in bytes
  constexpr size_t size = 1024 * 1024;
  constexpr size_t bytes = size * sizeof(float);

  // allocate input and output host buffers
  std::vector<float> in1_h(size);
  std::vector<float> in2_h(size);
  std::vector<float> out_h(size);

  // fill the input buffers with random data, and the output buffer with zeros
  for (size_t i = 0; i < size; ++i) {
    in1_h[i] = dist(rand);
    in2_h[i] = dist(rand);
    out_h[i] = 0.f;
  }

  SECTION("Test add_vectors_f") {
    // allocate input and output buffers on the device
    float* in1_d = nullptr;
    float* in2_d = nullptr;
    float* out_d = nullptr;
    cudaCheck(cudaMalloc(&in1_d, bytes));
    cudaCheck(cudaMalloc(&in2_d, bytes));
    cudaCheck(cudaMalloc(&out_d, bytes));

    // copy the input data to the device
    cudaCheck(cudaMemcpy(in1_d, in1_h.data(), bytes, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(in2_d, in2_h.data(), bytes, cudaMemcpyHostToDevice));

    // fill the output buffer with zeros
    cudaCheck(cudaMemset(out_d, 0, bytes));

    // launch the 1-dimensional kernel for vector addition: 32 blocks of 32 threads, i.e. far
    // fewer threads than elements, so the test relies on add_vectors_f covering the whole range
    kernel_add_vectors_f<<<32, 32>>>(in1_d, in2_d, out_d, size);
    // a kernel launch returns no status; launch errors are picked up here
    cudaCheck(cudaGetLastError());

    // copy the results from the device to the host
    cudaCheck(cudaMemcpy(out_h.data(), out_d, bytes, cudaMemcpyDeviceToHost));

    // wait for all the operations to complete
    cudaCheck(cudaDeviceSynchronize());

    // release the device buffers; done before the checks below so that a failing REQUIRE
    // cannot skip the cleanup
    cudaCheck(cudaFree(in1_d));
    cudaCheck(cudaFree(in2_d));
    cudaCheck(cudaFree(out_d));

    // check the results against the sum computed on the host
    for (size_t i = 0; i < size; ++i) {
      float sum = in1_h[i] + in2_h[i];
      REQUIRE(out_h[i] < sum + epsilon);
      REQUIRE(out_h[i] > sum - epsilon);
    }
  }
}