#include <cuda_runtime_api.h>
#include <iostream>
#include <vector>
#include <random>
#include <chrono>
#include <tuple>
#include <utility>
#include <numeric>
#include <iomanip>
#include "../../shared/include/utility.h"
// Declare a GPU-visible floating point variable in global memory.
__device__ float dResult;
/*
 The most basic reduction kernel uses atomic operations to accumulate
 the individual inputs in a single, device-wide visible variable.
 If you have experience with atomics, it is important to note that the
 basic atomicXXX instructions of CUDA have RELAXED semantics (scary!).
 That means that the threads operating atomically on a variable only
 agree that there is a particular order for the accesses to that variable
 and nothing else (especially no acquire/release semantics).
*/
__global__ void reduceAtomicGlobal(const float* __restrict input, int N)
{
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    /*
     Since all blocks must have the same number of threads,
     we may have to launch more threads than there are
     inputs. Superfluous threads should not try to read
     from the input (out of bounds access!)
    */
    if (id < N)
        atomicAdd(&dResult, input[id]);
}
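
/*
 Side note (a sketch, not used or benchmarked by this sample): if a reduction
 ever did need stronger ordering than the relaxed atomicAdd above, libcu++
 (shipped with CUDA 11.0 and newer) exposes std::atomic-style operations with
 explicit memory orders. The name reduceWithRelease below is hypothetical and
 the snippet assumes the <cuda/atomic> header is available:

     // #include <cuda/atomic>
     // __device__ void reduceWithRelease(float v)
     // {
     //     cuda::atomic_ref<float, cuda::thread_scope_device> ref(dResult);
     //     ref.fetch_add(v, cuda::memory_order_release);
     // }

 For this sample, relaxed ordering is sufficient, because the accumulated
 value is only read on the host after an implicit synchronization.
*/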

/*
 First improvement: shared memory is much faster than global
 memory. Each block can accumulate partial results in isolated
 block-wide visible memory. This relieves the contention on
 a single global variable that all threads want access to.
*/
__global__ void reduceAtomicShared(const float* __restrict input, int N)
{
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    // Declare a shared float for each block
    __shared__ float x;

    // Only one thread should initialize this shared value
    if (threadIdx.x == 0)
        x = 0.0f;

    /*
     Before we continue, we must ensure that all threads
     can see this update (initialization) by thread 0
    */
    __syncthreads();

    /*
     Every thread in the block adds its input to the
     shared variable of the block.
    */
    if (id < N)
        atomicAdd(&x, input[id]);

    // Wait until all threads have done their part
    __syncthreads();

    /*
     Once they are all done, only one thread must add
     the block's partial result to the global variable.
    */
    if (threadIdx.x == 0)
        atomicAdd(&dResult, x);
}

/*
 Second improvement: choosing a more suitable algorithm.
 We can exploit the fact that the GPU is massively parallel
 and come up with a fitting procedure that uses multiple
 iterations. In each iteration, threads accumulate partial
 results from the previous iteration. Before, the contended
 accesses to one location forced the GPU to perform updates
 sequentially in O(N). Now, each thread can access its own,
 exclusive shared variable in each iteration in parallel,
 giving an effective runtime that is closer to O(log N).
*/
template <unsigned int BLOCK_SIZE>
__global__ void reduceShared(const float* __restrict input, int N)
{
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    /*
     Use a larger shared memory region so that each
     thread can store its own partial results
    */
    __shared__ float data[BLOCK_SIZE];

    /*
     Use a new strategy to handle superfluous threads.
     To make sure they stay alive and can help with
     the reduction, threads without an input simply
     produce a '0', which has no effect on the result.
    */
    data[threadIdx.x] = (id < N ? input[id] : 0);

    /*
     log N iterations to complete. In each step, a thread
     accumulates two partial values to form the input for
     the next iteration. The sum of all partial results
     eventually yields the full result of the reduction.
    */
    for (int s = blockDim.x / 2; s > 0; s /= 2)
    {
        /*
         In each iteration, we must make sure that all
         threads are done writing the updates of the
         previous iteration / the initialization.
        */
        __syncthreads();

        if (threadIdx.x < s)
            data[threadIdx.x] += data[threadIdx.x + s];
    }

    /*
     Note: thread 0 is the last thread to combine two
     partial results, and the one that writes to global
     memory; therefore, no synchronization is required
     after the last iteration.
    */
    if (threadIdx.x == 0)
        atomicAdd(&dResult, data[0]);
}

/*
 Warp-level improvement: using warp-level primitives to
 accelerate the final steps of the reduction. Warps
 have a fast lane for communication. They are free
 to exchange values in registers when they are being
 scheduled for execution. Warps will be formed from
 consecutive threads in groups of 32.
*/
template <unsigned int BLOCK_SIZE>
__global__ void reduceShuffle(const float* __restrict input, int N)
{
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    __shared__ float data[BLOCK_SIZE];
    data[threadIdx.x] = (id < N ? input[id] : 0);

    // Only use shared memory until the last 32 values remain
    for (int s = blockDim.x / 2; s > 16; s /= 2)
    {
        __syncthreads();

        if (threadIdx.x < s)
            data[threadIdx.x] += data[threadIdx.x + s];
    }

    // The last 32 values can be handled with warp-level primitives
    float x = data[threadIdx.x];
    if (threadIdx.x < 32)
    {
        /*
         The threads in the first warp shuffle their registers.
         This replaces the last 5 iterations of the previous solution.
         The mask indicates which threads participate in the shuffle.
         The value indicates which register should be shuffled.
         The final parameter gives the source thread from which the
         current one should receive the shuffled value. Accesses that
         are out of range (>= 32) will wrap around, but are not needed
         (they will not affect the final result written by thread 0).
         In each shuffle, at least half of the threads only participate
         so that they can provide useful data from the previous shuffle
         to the lower threads. To keep the code short, we always let all
         threads participate, because it is an error to let threads reach
         a shuffle instruction that they do not participate in.
        */
        x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 16);
        x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 8);
        x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 4);
        x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 2);
        x += __shfl_sync(0xFFFFFFFF, x, 1);
    }

    if (threadIdx.x == 0)
        atomicAdd(&dResult, x);
}
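
/*
 For reference, a sketch of the same kernel written with __shfl_down_sync
 (the kernel name reduceShuffleDown is ours; it is not benchmarked by this
 sample): shifting values "down" by a decreasing offset is the more common
 formulation of a warp reduction and avoids indexing with threadIdx.x in
 the shuffle itself. Both variants compute the same block-wide sum.
*/
template <unsigned int BLOCK_SIZE>
__global__ void reduceShuffleDown(const float* __restrict input, int N)
{
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    __shared__ float data[BLOCK_SIZE];
    data[threadIdx.x] = (id < N ? input[id] : 0);

    // Shared memory handles the reduction down to the last 32 values
    for (int s = blockDim.x / 2; s > 16; s /= 2)
    {
        __syncthreads();

        if (threadIdx.x < s)
            data[threadIdx.x] += data[threadIdx.x + s];
    }

    // The first warp folds the remaining 32 values with register shuffles
    if (threadIdx.x < 32)
    {
        float x = data[threadIdx.x];
        for (int offset = 16; offset > 0; offset /= 2)
            x += __shfl_down_sync(0xFFFFFFFF, x, offset);

        // After the last shuffle, lane 0 holds the block's partial sum
        if (threadIdx.x == 0)
            atomicAdd(&dResult, x);
    }
}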

/*
 Final improvement: half of our threads actually idle after
 they have loaded data from global memory to shared! Better
 to have threads fetch two values at the start and then let
 them all do at least some meaningful work. This means that
 compared to all other methods, only half the number of
 threads must be launched in the grid!
*/
template <unsigned int BLOCK_SIZE>
__global__ void reduceFinal(const float* __restrict input, int N)
{
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    __shared__ float data[BLOCK_SIZE];
    // Already combine two values upon load from global memory.
    data[threadIdx.x] = id < N / 2 ? input[id] : 0;
    data[threadIdx.x] += id + N / 2 < N ? input[id + N / 2] : 0;

    for (int s = blockDim.x / 2; s > 16; s /= 2)
    {
        __syncthreads();

        if (threadIdx.x < s)
            data[threadIdx.x] += data[threadIdx.x + s];
    }

    float x = data[threadIdx.x];
    if (threadIdx.x < 32)
    {
        x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 16);
        x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 8);
        x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 4);
        x += __shfl_sync(0xFFFFFFFF, x, threadIdx.x + 2);
        x += __shfl_sync(0xFFFFFFFF, x, 1);
    }

    if (threadIdx.x == 0)
        atomicAdd(&dResult, x);
}
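
/*
 A possible further variation (a sketch under the assumption that CUDA 11.0+
 and the cooperative groups headers are available; the kernel name
 reduceCooperative is ours and it is not benchmarked by this sample): the
 warp-level part of the reduction can also be expressed with cooperative
 groups, which hides the shuffle pattern behind cg::reduce.
*/
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

template <unsigned int BLOCK_SIZE>
__global__ void reduceCooperative(const float* __restrict input, int N)
{
    const int id = threadIdx.x + blockIdx.x * blockDim.x;

    __shared__ float data[BLOCK_SIZE];
    data[threadIdx.x] = (id < N ? input[id] : 0);

    for (int s = blockDim.x / 2; s > 16; s /= 2)
    {
        __syncthreads();

        if (threadIdx.x < s)
            data[threadIdx.x] += data[threadIdx.x + s];
    }

    // Partition the block into warp-sized tiles (collective over the block)
    auto warp = cg::tiled_partition<32>(cg::this_thread_block());

    if (threadIdx.x < 32)
    {
        // cg::reduce sums the value across the tile of the first 32 threads
        float x = cg::reduce(warp, data[threadIdx.x], cg::plus<float>());

        if (threadIdx.x == 0)
            atomicAdd(&dResult, x);
    }
}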

int main()
{
    std::cout << "==== Sample 08 - Reductions ====\n" << std::endl;
    /*
     Expected output: Accumulated results from CPU and GPU that
     approach 42 * N (can vary greatly due to floating point
     precision limitations).

     With more sophisticated techniques, the measured runtime of the
     GPU versions (reported in ms) should generally decrease.
    */
    constexpr unsigned int BLOCK_SIZE = 256;
    constexpr unsigned int WARMUP_ITERATIONS = 10;
    constexpr unsigned int TIMING_ITERATIONS = 20;
    constexpr unsigned int N = 100'000'000;

    std::cout << "Producing random inputs...\n" << std::endl;

    // Generate some random numbers to reduce
    std::vector<float> vals;
    float* dValsPtr;
    samplesutil::prepareRandomNumbersCPUGPU(N, vals, &dValsPtr);

    std::cout << "==== CPU Reduction ====\n" << std::endl;
    // A reference value is computed by sequential reduction
    std::cout << "Computed CPU value: " << std::accumulate(vals.cbegin(), vals.cend(), 0.0f) << std::endl;

    std::cout << "==== GPU Reductions ====\n" << std::endl;
    /*
     Set up a collection of reductions to evaluate for performance.
     Each entry gives a technique's name, the kernel to call, and
     the number of threads required for each individual technique.
    */
    const std::tuple<const char*, void(*)(const float*, int), unsigned int> reductionTechniques[]
    {
        {"Atomic Global", reduceAtomicGlobal, N},
        {"Atomic Shared", reduceAtomicShared, N},
        {"Reduce Shared", reduceShared<BLOCK_SIZE>, N},
        {"Reduce Shuffle", reduceShuffle<BLOCK_SIZE>, N},
        {"Reduce Final", reduceFinal<BLOCK_SIZE>, N / 2 + 1}
    };

    // Evaluate each technique separately
    for (const auto& [name, func, numThreads] : reductionTechniques)
    {
        // Compute the smallest grid that covers the required threads with the given block size
        const dim3 blockDim = { BLOCK_SIZE, 1, 1 };
        const dim3 gridDim = { (numThreads + BLOCK_SIZE - 1) / BLOCK_SIZE, 1, 1 };

        // Run several reductions so the GPU can warm up
        for (int i = 0; i < WARMUP_ITERATIONS; i++)
            func<<<gridDim, blockDim>>>(dValsPtr, N);

        // Synchronize to ensure the CPU only starts timing after the warmup is done
        cudaDeviceSynchronize();
        const auto before = std::chrono::system_clock::now();

        float result = 0.0f;
        // Run several iterations to get an average measurement
        for (int i = 0; i < TIMING_ITERATIONS; i++)
        {
            // Reset the accumulated result to 0 in each run
            cudaMemcpyToSymbol(dResult, &result, sizeof(float));
            func<<<gridDim, blockDim>>>(dValsPtr, N);
        }

        // cudaMemcpyFromSymbol will implicitly synchronize CPU and GPU,
        // so we can measure time without an extra synchronization
        cudaMemcpyFromSymbol(&result, dResult, sizeof(float));
        const auto after = std::chrono::system_clock::now();

        const auto elapsed = 1000.f * std::chrono::duration_cast<std::chrono::duration<float>>(after - before).count();
        std::cout << std::setw(20) << name << "\t" << elapsed / TIMING_ITERATIONS << "ms \t" << result << std::endl;
    }

    // Free the allocated memory for the input
    cudaFree(dValsPtr);
    return 0;
}

/*
 Exercises:
 1) Change the program so that the methods reduce integer values instead of floats.
    Can you observe any difference in terms of speed / computed results?
 2) Do you have any other ideas for how the reduction could be improved?
    Making it even faster should be quite challenging, but if you have
    some suggestions, try them out and see how they affect performance!
*/
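
/*
 Hints for the exercises (sketches, not part of the measured sample):

 1) For integers, atomicAdd(int*, int) is exact and integer addition is
    associative, so all methods should produce identical results across
    runs, unlike the float versions where the summation order changes
    the rounding.

 2) On devices of compute capability 8.0 or newer, integer warp sums can
    also use the hardware-accelerated redux instruction, e.g.:

    // unsigned int warpTotal = __reduce_add_sync(0xFFFFFFFF, myValue);
*/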