-
Notifications
You must be signed in to change notification settings - Fork 1
/
accum_correct_super_slow4.cpp
119 lines (93 loc) · 3.21 KB
/
accum_correct_super_slow4.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
// Greg Stitt
// University of Florida
//
// accum_correct_super_slow4.cpp
//
// This SYCL program will create a parallel (vectorized) version of the following
// sequential code:
//
// int accum = 0;
// for (int i=0; i < VECTOR_SIZE; i++)
// accum += x[i];
//
// When running the example on the DevCloud, the execution time of this example
// for 1000000000 (1 billion) inputs was 4.45s.
#include <iostream>
#include <iomanip>
#include <vector>
#include <random>
#include <cmath>
#include <chrono>
#include <CL/sycl.hpp>
class accum;
void print_usage(const std::string& name) {
std::cout << "Usage: " << name << " vector_size (must be positive)" << std::endl;
}
int main(int argc, char* argv[]) {
// Check correct usage of command line.
if (argc != 2) {
print_usage(argv[0]);
return 1;
}
// Get vector size from command line.
int vector_size;
vector_size = atoi(argv[1]);
if (vector_size <= 0) {
print_usage(argv[0]);
return 1;
}
std::chrono::time_point<std::chrono::system_clock> start_time, end_time;
std::vector<int> x_h(vector_size);
std::vector<int> y_h(vector_size);
int correct_out = 0;
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dist(-10, 10);
for (size_t i=0; i < vector_size; i++) {
x_h[i] = dist(gen);
correct_out += x_h[i];
}
try {
cl::sycl::queue queue(cl::sycl::default_selector_v, [] (sycl::exception_list el) {
for (auto ex : el) { std::rethrow_exception(ex); }
} );
start_time = std::chrono::system_clock::now();
for (int size = vector_size; size > 1; size = ceil(size / 2.0)) {
cl::sycl::buffer<int, 1> x_buf {x_h.data(), cl::sycl::range<1>(size) };
cl::sycl::buffer<int, 1> y_buf {y_h.data(), cl::sycl::range<1>(ceil(size/2.0)) };
// CHANGES FROM PREVIOUS EXAMPLE
// In this version, we optimize further by not always using the same number of
// work-items, which were previously calculated based on the original vector
// size. Since the size (the number of items to add) shrinks every iteration),
// we can similarly shrink the number of work-items every iteration.
unsigned num_work_items = ceil(size / 2.0);
queue.submit([&](cl::sycl::handler& handler) {
cl::sycl::accessor x_d(x_buf, handler, cl::sycl::read_only);
cl::sycl::accessor y_d(y_buf, handler, cl::sycl::write_only);
handler.parallel_for<class accum>(cl::sycl::range<1> { num_work_items }, [=](cl::sycl::id<1> i) {
if (2*i + 1 == size)
y_d[i] = x_d[2*i];
else if (2*i + 1 < size)
y_d[i] = x_d[2*i] + x_d[2*i+1];
});
});
queue.wait();
y_buf.get_access<cl::sycl::access::mode::read>();
for (int i=0; i < ceil(size/2.0); i++) {
x_h[i] = y_h[i];
}
}
end_time = std::chrono::system_clock::now();
}
catch (cl::sycl::exception& e) {
std::cout << e.what() << std::endl;
return 1;
}
if (correct_out != x_h[0]) {
std::cout << "ERROR: output was " << x_h[0] << " instead of " << correct_out << std::endl;
return 1;
}
std::chrono::duration<double> seconds = end_time - start_time;
std::cout << "SUCCESS! Time: " << seconds.count() << "s" << std::endl;
return 0;
}