// vec_add_count.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Abort with a readable message when a CUDA runtime call does not return
// cudaSuccess. Wrapping every call keeps an early failure from surfacing as a
// confusing error in some later, unrelated call.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cuda_check_err_));        \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise vector addition: array_c[i] = array_a[i] + array_b[i].
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid is
 * allowed to contain more threads than elements; threads whose global index
 * is >= n return without touching memory.
 *
 * Parameters:
 *   array_a, array_b  input vectors, each holding at least n ints
 *   array_c           output vector, holding at least n ints
 *   n                 number of elements to add
 *
 * No shared memory is used.
 */
__global__ void add_in_parallel(const int *array_a, const int *array_b,
                                int *array_c, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // The last block is usually only partly populated, so guard the tail.
    if (tid < n)
    {
        array_c[tid] = array_a[tid] + array_b[tid];
    }
}

/*
 * Adds two integer vectors on the GPU, times the host-to-device copy, the
 * kernel and the device-to-host copy with CUDA events, recomputes the sum on
 * the CPU, and compares both results.
 *
 * Returns EXIT_SUCCESS when the GPU result matches the CPU reference and
 * EXIT_FAILURE otherwise (or when an allocation / CUDA call fails).
 */
int main()
{
    // --------------------------------------------
    printf("Begin...\n");
    const int arraysize = 100000;
    const size_t bytes = arraysize * sizeof(int);

    int *a_host = (int *)malloc(bytes);
    int *b_host = (int *)malloc(bytes);
    int *c_host = (int *)malloc(bytes);
    int *devresult_host = (int *)malloc(bytes);
    if (a_host == NULL || b_host == NULL || c_host == NULL || devresult_host == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all four is safe here.
        free(a_host);
        free(b_host);
        free(c_host);
        free(devresult_host);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < arraysize; i++)
    {
        a_host[i] = i;
        b_host[i] = i;
    }

    // ---------------------------------------------
    printf("Allocating device memory...\n");
    int *a_dev = NULL;
    int *b_dev = NULL;
    int *c_dev = NULL;
    CUDA_CHECK(cudaMalloc((void **)&a_dev, bytes));
    CUDA_CHECK(cudaMalloc((void **)&b_dev, bytes));
    CUDA_CHECK(cudaMalloc((void **)&c_dev, bytes));

    // One pair of events is created once and reused for all three
    // measurements, then destroyed during cleanup.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // ----------------------------------------------
    float time_from_host_to_dev = 0.0f;
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUDA_CHECK(cudaMemcpy(a_dev, a_host, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_dev, b_host, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    // Waiting on `stop` is sufficient: it is recorded after `start` in the
    // same stream.
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time_from_host_to_dev, start, stop));
    printf("Copy host data to device, time used: %0.5g seconds\n", time_from_host_to_dev/1000);

    // ----------------------------------------------
    float time_of_kernel = 0.0f;
    const int blocksize = 512;
    // Integer ceil-division: enough blocks to cover every element.
    const int blocknum = (arraysize + blocksize - 1) / blocksize;
    dim3 dimBlock(blocksize, 1, 1);
    dim3 dimGrid(blocknum, 1, 1);
    CUDA_CHECK(cudaEventRecord(start, 0));
    add_in_parallel<<<dimGrid, dimBlock>>>(a_dev, b_dev, c_dev, arraysize);
    // A kernel launch returns nothing; launch errors are reported here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time_of_kernel, start, stop));
    printf("Add in parallel, time used: %0.5g seconds\n", time_of_kernel/1000);

    // ----------------------------------------------
    float time_from_dev_to_host = 0.0f;
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUDA_CHECK(cudaMemcpy(devresult_host, c_dev, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time_from_dev_to_host, start, stop));
    printf("Copy dev data to host, time used: %0.5g seconds\n", time_from_dev_to_host/1000);

    // -------------------------------------------------
    printf("Verify result...\n");
    int status = 0;
    clock_t start_cpu, end_cpu;
    float time_cpu;
    start_cpu = clock();
    for (int i = 0; i < arraysize; i++)
    {
        c_host[i] = a_host[i] + b_host[i];
    }
    end_cpu = clock();
    time_cpu = (double)(end_cpu - start_cpu) / CLOCKS_PER_SEC;
    for (int i = 0; i < arraysize; i++)
    {
        if (c_host[i] != devresult_host[i])
        {
            status = 1;
        }
    }
    if (status)
    {
        printf("Failed vervified.\n");
    }
    else
    {
        printf("Sucessdully verified.\n");
    }

    // ----------------------------------------------
    printf("Free dev memory\n");
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(a_dev));
    CUDA_CHECK(cudaFree(b_dev));
    CUDA_CHECK(cudaFree(c_dev));

    // ----------------------------------------
    printf("Free host memory\n");
    free(a_host);
    free(b_host);
    free(c_host);
    free(devresult_host);

    // ----------------------------------------
    printf("\nPerformance: CPU vs. GPU\n");
    printf("time cpu:%f\n", time_cpu);
    printf("time gpu(kernel):%f\n", time_of_kernel/1000);

    // Exit code reflects the verification outcome.
    return status ? EXIT_FAILURE : EXIT_SUCCESS;
}