/
fullProgram.cpp
274 lines (232 loc) · 10 KB
/
fullProgram.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#include "reference_calc.cpp"
#include "utils.h"
#include <stdio.h>
#include <float.h>
#include <stdlib.h>
#include "timer.h"
#include <string>
//////////// HELPER FCNS AND KERNELS //////////////////
#define getDigit(val, start, radix) ((val>>start) & (radix - 1))
#define ITEMSPERTHREAD 4
// basically memset, but for unsigned int's only
__device__ void k_assignRange(unsigned int * start, unsigned int value, unsigned int numElems)
{
for (int i = 0; i < numElems; i++)
*(start + i) = value;
}
// basically memcpy, but for unsigned int's only
__device__ void k_copyRange(unsigned int * output, unsigned int * input, unsigned int numElems)
{
for (int i = 0; i < numElems; i++)
output[i] = input[i];
}
// performs a prefix sum, where each thread has the value pred
__device__ unsigned int scan(unsigned int pred, unsigned int * sh_sums)
{
sh_sums[threadIdx.x] = pred;
__syncthreads();
for (int skip = 1; skip < blockDim.x; skip *= 2)
{
int newValue = (threadIdx.x >= skip) ? sh_sums[threadIdx.x] + sh_sums[threadIdx.x - skip] : sh_sums[threadIdx.x];
syncthreads();
sh_sums[threadIdx.x] = newValue;
syncthreads();
}
if (threadIdx.x > 0)
return sh_sums[threadIdx.x - 1];
else
return 0;
}
// like above, where each thread handles multiple values and outputs several different values in the prefix sum
// itemsThisThread will equal itesmPerThread except if this block has threads that have no items to work with
__device__ void scanMultiple(unsigned int * outputSums,
unsigned int * inputVals,
unsigned int localID, // first value in sh_sums[] this thread deals with
unsigned int numElems,
unsigned int * sh_sums, // shared memory for computing the sums
unsigned int itemsThisThread) // # of items this thread works with
{
k_copyRange(sh_sums + localID, inputVals, itemsThisThread);
__syncthreads();
unsigned int newValues[ITEMSPERTHREAD];
for (int skip = 1; skip < numElems; skip *= 2)
{
for (int i = 0; i < itemsThisThread; i++)
{
if (localID + i >= skip)
newValues[i] = sh_sums[localID + i] + sh_sums[localID + i - skip];
else
newValues[i] = sh_sums[localID + i];
}
__syncthreads();
k_copyRange(sh_sums + localID, newValues, itemsThisThread);
__syncthreads();
}
// write output
if (threadIdx.x > 0)
outputSums[0] = sh_sums[localID - 1];
else
outputSums[0] = 0;
k_copyRange(outputSums + 1, sh_sums + localID, itemsThisThread - 1);
}
// outputs the "rank" of each item, for partitioning the block by predicate value
__device__ unsigned int split(bool pred, unsigned int blocksize, unsigned int * sh_sums)
{
unsigned int true_before = scan(pred, sh_sums);
__shared__ unsigned int false_total;
if(threadIdx.x == blocksize - 1)
false_total = blocksize - (true_before + pred);
__syncthreads();
if(pred)
return true_before + false_total;
else
return threadIdx.x - true_before;
}
// single-block radix sort
__global__ void k_radixSortBlock(unsigned int * d_vals,
unsigned int * d_pos,
unsigned int startBit,
unsigned int radix,
unsigned int numElems)
{
int inputID = threadIdx.x + blockDim.x * blockIdx.x;
if (inputID < numElems)
{
extern __shared__ unsigned int sh_arr[];
unsigned int * sh_sums = sh_arr;
unsigned int * sh_vals = sh_arr + blockDim.x;
unsigned int * sh_pos = sh_arr + blockDim.x * 2;
sh_vals[threadIdx.x] = d_vals[inputID];
sh_pos[threadIdx.x] = d_pos[inputID];
__syncthreads();
for (int d = 1; d < radix; d <<= 1)
{
unsigned int i = split(((sh_vals[threadIdx.x]>>startBit) & d) > 0, min(numElems - blockDim.x * blockIdx.x, blockDim.x), sh_sums);
unsigned int oldValue = sh_vals[threadIdx.x];
unsigned int oldPos = sh_pos[threadIdx.x];
__syncthreads();
sh_vals[i] = oldValue;
sh_pos[i] = oldPos;
__syncthreads();
}
d_vals[blockDim.x * blockIdx.x + threadIdx.x] = sh_vals[threadIdx.x];
d_pos[blockDim.x * blockIdx.x + threadIdx.x] = sh_pos[threadIdx.x];
}
}
// 1st step to calculating globalOffsets (offsets for each block and each radix)
// this step simply counts the # of each radix value in each block, later we do a scan on that to get globalOffsets
// as well as calculating localOffsets (offsets within each block for each radix)
// d_bucketSize[i][j] is the # of elements of radix i that are in block j
// send only enough threads to look at blockSize - 1 items (since each thread compares to the next item in the block)
__global__ void k_findOffsets(unsigned int * d_globalOffsets,
unsigned int * d_localOffsets,
unsigned int * d_vals,
unsigned int startBit,
unsigned int radix,
unsigned int numElems)
{
extern __shared__ unsigned int sh_arr[]; // for storing offsets
unsigned int inputID = threadIdx.x + blockDim.x * blockIdx.x;
unsigned int blockSize = min(numElems - blockDim.x * blockIdx.x, blockDim.x);
int thisDigit = getDigit(d_vals[inputID], startBit, radix);
if (inputID < numElems)
{
// missing radix values before the first item? 1st thread gives their offset as 0
if (threadIdx.x == 0)
k_assignRange(sh_arr, 0, thisDigit + 1);
// missing radix values after the last item? last thread gives their offset as blockSize
if (threadIdx.x == blockSize - 1)
k_assignRange(sh_arr + thisDigit + 1, blockSize, radix - 1 - thisDigit);
else
{
int nextDigit = getDigit(d_vals[inputID + 1], startBit, radix);
// assign offsets for all value(s) between this digit and the next digit, including the next digit but not this one
if (nextDigit > thisDigit)
k_assignRange(sh_arr + thisDigit + 1, threadIdx.x + 1, nextDigit - thisDigit);
}
__syncthreads();
// index both output arrays in bucket-major order
unsigned int outputID = blockIdx.x + gridDim.x * threadIdx.x;
if (threadIdx.x < radix - 1)
{
d_localOffsets[outputID] = sh_arr[threadIdx.x];
d_globalOffsets[outputID] = sh_arr[threadIdx.x + 1] - sh_arr[threadIdx.x];
}
else if (threadIdx.x == radix - 1)
{
d_localOffsets[outputID] = sh_arr[threadIdx.x];
d_globalOffsets[outputID] = blockSize - sh_arr[threadIdx.x];
}
}
}
__global__ void k_scan(unsigned int * d_vals,
unsigned int numElems)
{
unsigned int localID = threadIdx.x * ITEMSPERTHREAD;
unsigned int inputID = blockDim.x * blockIdx.x + localID;
unsigned int itemsThisThread = min(numElems - localID, ITEMSPERTHREAD);
if (inputID >= numElems)
return;
extern __shared__ unsigned int sh_arr[];
unsigned int outputSums[ITEMSPERTHREAD];
scanMultiple(outputSums, d_vals + inputID, localID, numElems, sh_arr, itemsThisThread);
__syncthreads();
k_copyRange(d_vals + inputID, outputSums, itemsThisThread);
}
__global__ void k_scatter(unsigned int * d_outputVals,
unsigned int * d_inputVals,
unsigned int * d_outputPos,
unsigned int * d_inputPos,
unsigned int * d_localOffsets,
unsigned int * d_globalOffsets,
int startBit,
int radix,
unsigned int numElems)
{
unsigned int inputID = blockDim.x * blockIdx.x + threadIdx.x;
if (inputID >= numElems)
return;
int thisDigit = getDigit(d_inputVals[inputID], startBit, radix);
unsigned int offsetIndex = gridDim.x * thisDigit + blockIdx.x;
unsigned int outputID = threadIdx.x - d_localOffsets[offsetIndex] + d_globalOffsets[offsetIndex];
d_outputVals[outputID] = d_inputVals[inputID];
d_outputPos[outputID] = d_inputPos[inputID];
}
// swap pointers
void exch(unsigned int * * a, unsigned int * * b)
{
unsigned int * temp = *a;
*a = *b;
*b = temp;
}
// based off of this paper: http://mgarland.org/files/papers/nvr-2008-001.pdf
void your_sort (unsigned int* const d_inputVals,
unsigned int* const d_inputPos,
unsigned int* const d_outputVals,
unsigned int* const d_outputPos,
const size_t numElems)
{
// PREFERENCES
const unsigned int blockSize = 512;
const int numBits = 3;
const unsigned int radix = pow(2,numBits);
unsigned int numBlocks = (numElems - 1) / blockSize + 1;
assert((radix * numBlocks - 1) / ITEMSPERTHREAD + 1 < 1024);
unsigned int * d_globalOffsets, * d_localOffsets;
checkCudaErrors(cudaMalloc(&d_globalOffsets, radix * numBlocks * sizeof(unsigned int)));
checkCudaErrors(cudaMalloc(&d_localOffsets, radix * numBlocks * sizeof(unsigned int)));
unsigned int * d_valuesA = d_inputVals;
unsigned int * d_valuesB = d_outputVals;
unsigned int * d_posB = d_outputPos;
unsigned int * d_posA = d_inputPos;
for (int d = 0; d < 32; d += numBits)
{
k_radixSortBlock<<<numBlocks, blockSize, 3 * blockSize*sizeof(unsigned int)>>>(d_valuesA, d_posA, d, radix, numElems);
k_findOffsets<<<numBlocks, blockSize, (radix + 1)*sizeof(unsigned int)>>>(d_globalOffsets, d_localOffsets, d_valuesA, d, radix, numElems);
k_scan<<<1, (radix * numBlocks - 1) / ITEMSPERTHREAD + 1, radix * numBlocks * sizeof(unsigned int)>>>(d_globalOffsets,numElems);
k_scatter<<<numBlocks, blockSize>>>(d_valuesB, d_valuesA, d_posB, d_posA, d_localOffsets, d_globalOffsets, d, radix, numElems);
exch(&d_valuesA, &d_valuesB);
exch(&d_posA, &d_posB);
}
checkCudaErrors(cudaMemcpy(d_outputPos, d_posA, numElems *sizeof(unsigned int), cudaMemcpyDeviceToDevice));
}