-
Notifications
You must be signed in to change notification settings - Fork 1
/
accum_correct_slow2.cpp
216 lines (176 loc) · 7.77 KB
/
accum_correct_slow2.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
// Greg Stitt
// University of Florida
//
// accum_correct_slow2.cpp
//
// This SYCL program will create a parallel (vectorized) version of the following
// sequential code:
//
// int accum = 0;
// for (int i=0; i < VECTOR_SIZE; i++)
// accum += x[i];
//
// This example improves performance significantly over the previous one
// by leveraging work-groups and local memory to 1) minimize repeated accesses
// to slower global memory, and 2) minimize transfers between the host and
// device.
//
// The overall strategy is similar to before, where we index into the input
// array using a stride that increases for each iteration in order to avoid
// conflicts where work-items overwrite inputs. However, we now do this at
// a much finer granularity.
//
// Basically, we leverage this same approach, but only within work groups.
// Each group first transfers its portion of the inputs from global memory.
// Internally, each group iterates in the exact same way as the previous example,
// but because we can synchronize work-items within a group, we don't have to
// leverage the host to do the synchronization. This minimizes the number
// of times the host has to start the kernel, and also minimizes the number
// of times the global data is copied.
//
// For a visual explanation of this strategy, see slides ADDLATER.
//
// When running the example on the DevCloud, the execution time of this example
// for 1000000000 (1 billion) inputs was 1.87s.
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <cmath>
#include <chrono>
#include <CL/sycl.hpp>
class accum;
// Prints the expected command-line usage of this program to stdout.
// name: the program name (typically argv[0]).
void print_usage(const std::string& name) {
    const std::string suffix = " vector_size (must be positive)";
    std::cout << "Usage: " << name << suffix << std::endl;
}
int main(int argc, char* argv[]) {
// Check correct usage of command line.
if (argc != 2) {
print_usage(argv[0]);
return 1;
}
// Get vector size from command line.
int vector_size;
vector_size = atoi(argv[1]);
if (vector_size <= 0) {
print_usage(argv[0]);
return 1;
}
std::chrono::time_point<std::chrono::system_clock> start_time, end_time;
std::vector<int> x_h(vector_size);
int correct_out = 0;
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dist(-10, 10);
for (size_t i=0; i < vector_size; i++) {
x_h[i] = dist(gen);
correct_out += x_h[i];
}
try {
// We separate the device selection from the queue creation here
// because we will need the device object to query information
// about local memory.
cl::sycl::device device;
cl::sycl::default_selector_v(device);
cl::sycl::queue queue(device, [] (sycl::exception_list el) {
for (auto ex : el) { std::rethrow_exception(ex); }
} );
// We first have to decide on how many work-items there will be
// in each group. This is often called the work-group size, but
// I'm avoiding that name here due to many other "sizes"
// (e.g., vectors, number of inputs, buffers, local memory, etc.).
int work_items_per_group = 32;
// Since each work-item adds two inputs, the number of inputs
// processed by each group is the work items per group * 2.
int inputs_per_group = work_items_per_group * 2;
// Get the size of the local memory on the device.
auto local_mem_size = device.get_info<sycl::info::device::local_mem_size>();
// Check for errors with the local memory.
if (local_mem_size < (work_items_per_group * sizeof(int))) {
throw std::runtime_error("Insufficient local memory on device.");
}
start_time = std::chrono::system_clock::now();
// This could potentially be optimized by moving the buffer creation
// inside the loop and using num_global_inputs, which shrinks each iteration.
// However, it seems to slow down performance, which occurs because the
// destruction of the buffer triggers a transfer of data back to the
// host each iteration.
cl::sycl::buffer<int, 1> x_buf {x_h.data(), cl::sycl::range<1>(vector_size) };
int num_global_inputs = vector_size;
while(num_global_inputs > 1) {
int num_groups = ceil(num_global_inputs / float(inputs_per_group));
queue.submit([&](cl::sycl::handler& handler) {
cl::sycl::accessor x_d(x_buf, handler, cl::sycl::read_write);
// We are using local memory, so we need an additional accessor for that memory.
cl::sycl::local_accessor<int, 1> x_local(cl::sycl::range<1>(work_items_per_group), handler);
// Unlike the previous examples, we use an nd_randge instead of range to
// specify work-group sizes. Also, we use an nd_item intead of an id
// because the nd_item provides synchronization methods.
handler.parallel_for<class accum>(cl::sycl::nd_range<1>(num_groups * work_items_per_group, work_items_per_group), [=](cl::sycl::nd_item<1> item) {
// Get the global, local and group IDs from the nd_range.
size_t global_id = item.get_global_linear_id();
size_t local_id = item.get_local_linear_id();
size_t group_id = item.get_group_linear_id();
// Initialize the local memory for the current work-item.
x_local[local_id] = 0;
// Perform the first add from global memory, while simply
// copying data in the case of an odd number of inputs.
if (2*global_id + 1 == num_global_inputs) {
x_local[local_id] = x_d[2*global_id];
}
else if (2*global_id + 1 < num_global_inputs) {
x_local[local_id] = x_d[2*global_id] + x_d[2*global_id + 1];
}
// Wait for all work-items in the group to finish the first add.
// This is necessary because we need to ensure the local memory
// is fully loaded before doing more adds.
item.barrier(cl::sycl::access::fence_space::local_space);
// Leverage the same strategy as the previous example, just
// within a work-group instead of globally.
int stride = 1;
for (int num_local_inputs = inputs_per_group; num_local_inputs > 1; num_local_inputs = ceil(num_local_inputs/2.0)) {
int base = 2*stride*local_id;
if (2*local_id + 1 < num_local_inputs)
x_local[base] = x_local[base] + x_local[base + stride];
stride *= 2;
// When using work-groups, we can synchronize the execution of
// work-items within the group, so unlike the previous
// examples where a loop inside the kernel code would lead
// work-items progressing through the loop at different
// rates, we can now synchronize their execution so that
// a new iteration does start until all local work-items
// have finished the previous iteration.
item.barrier(cl::sycl::access::fence_space::local_space);
}
// Write the result (x_local[0]) of the work-group to global memory
// for the next iteration of the outer loop. This writes the outputs
// sequentially (with a stride of 1), so each iteration of the
// outer loop is just a smaller instance of the original problem.
if (local_id == 0) {
x_d[group_id] = x_local[0];
}
});
});
queue.wait();
num_global_inputs = num_groups;
}
// We don't have to do this here for the code to work,
// but we want to include the output transfer times
// in our execution time calculations. At this point,
// x_buf hasn't been destructed yet, so we need to
// explictly copy it before sampling the ending time.
x_buf.get_access<cl::sycl::access::mode::read>();
end_time = std::chrono::system_clock::now();
}
catch (cl::sycl::exception& e) {
std::cout << e.what() << std::endl;
return 1;
}
if (correct_out != x_h[0]) {
std::cout << "ERROR: output was " << x_h[0] << " instead of " << correct_out << std::endl;
return 1;
}
std::chrono::duration<double> seconds = end_time - start_time;
std::cout << "SUCCESS! Time: " << seconds.count() << "s" << std::endl;
return 0;
}