-
Notifications
You must be signed in to change notification settings - Fork 0
/
gpu-vector.cpp
110 lines (81 loc) · 2.83 KB
/
gpu-vector.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#include <stdlib.h>
#include <time.h>
#include "hip/hip_runtime.h"
__global__ void vector_euclidian_distance(int width, int n, int *a, int *b, double *c)
{
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_id < n)
{
double accumulator = 0;
int *row = (a + thread_id);
for (int i = 0; i < width; i++)
{
accumulator += pow(((double)*(row + i) - (double)*(b + i)), 2);
}
*(c + thread_id) = sqrt(accumulator);
}
}
int *generate_random_int_vector(int width, int num_vectors)
{
int *vectors = (int *)malloc(width * num_vectors * sizeof(int));
//fill with random values
for (int i = 0; i < num_vectors; i++)
{
for (int j = 0; j < width; j++)
{
*(vectors + i * width + j) = rand() % 100;
}
}
return vectors;
}
void print_int_vector(int *vector, int width, int num_vectors)
{
for (int i = 0; i < num_vectors; i++)
{
for (int j = 0; j < width; j++)
{
printf("{%d,%d} = %d\n", i, j, *(vector + i * width + j));
}
}
}
int main(int argc, char *argv[])
{
srand(time(NULL));
const int BLOCK_SIZE = 1024;
const int VECTOR_WIDTH = 5;
const int NUMBER_VECTORS = 1024 * 16;
const int NUM_GRIDS = (NUMBER_VECTORS / BLOCK_SIZE) + 1;
//generate the input & target vector
int *host_vectors = generate_random_int_vector(VECTOR_WIDTH, NUMBER_VECTORS);
// print_int_vector(host_vectors, VECTOR_WIDTH, NUMBER_VECTORS);
int *host_target = generate_random_int_vector(VECTOR_WIDTH, 1);
// print_int_vector(host_target, VECTOR_WIDTH, 1);
//container for the output
double *host_output = (double *)malloc(NUMBER_VECTORS * sizeof(double));
//device IO, allocate & copy
int *device_vectors;
int *device_target;
double *device_output;
int vectors_bytes = VECTOR_WIDTH * NUMBER_VECTORS * sizeof(int);
int target_bytes = VECTOR_WIDTH * sizeof(int);
int output_bytes = NUMBER_VECTORS * sizeof(double);
hipMalloc(&device_vectors, vectors_bytes);
hipMalloc(&device_target, target_bytes);
hipMalloc(&device_output, output_bytes);
hipMemcpyHtoD(device_vectors, host_vectors, vectors_bytes);
hipMemcpyHtoD(device_target, host_target, vectors_bytes);
// Launch the kernel
hipLaunchKernelGGL(vector_euclidian_distance, dim3(NUM_GRIDS), dim3(BLOCK_SIZE), 0, 0, VECTOR_WIDTH, NUMBER_VECTORS, device_vectors, device_target, device_output);
hipDeviceSynchronize();
//copy output back to the host
hipMemcpyDtoH(host_output, device_output, output_bytes);
printf("Freeing host...\n");
free(host_vectors);
free(host_target);
free(host_output);
printf("Freeing device...\n");
hipFree(device_vectors);
hipFree(device_target);
hipFree(device_output);
return EXIT_SUCCESS;
}