-
Notifications
You must be signed in to change notification settings - Fork 0
/
cuda_convolution.cu
118 lines (95 loc) · 4.09 KB
/
cuda_convolution.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#include <cuda_runtime.h>
#include<iostream>
#include<random>
#include "utils.cpp"
#define TILE_WIDTH_X 32
#define TILE_WIDTH_Y 16
#define INPUT_TILE_X (TILE_WIDTH_X + KER - 1)
#define INPUT_TILE_Y (TILE_WIDTH_Y + KER - 1)
__constant__ float ker[KER * KER];
/*
Cuda implementation of 2D convolution using constant memory and shared memory
@param image: input image
@param outputImageData: output image
@param width: image width
@param height: image height
*/
__global__ void device_convolution(const uchar *image, uchar *out, const int width, const int height) {
__shared__ float N_ds[INPUT_TILE_Y][INPUT_TILE_X];
int ker_r = KER / 2;
int dest = threadIdx.y * TILE_WIDTH_X + threadIdx.x;
int dest_y = dest / INPUT_TILE_X;
int dest_x = dest % INPUT_TILE_X;
int src_x = blockIdx.x * TILE_WIDTH_X + dest_x - ker_r;
int src_y = blockIdx.y * TILE_WIDTH_Y + dest_y - ker_r;
if (src_y >= 0 && src_y < height && src_x >= 0 && src_x < width)
N_ds[dest_y][dest_x] = image[(src_y * width + src_x)];
else
N_ds[dest_y][dest_x] = 0;
dest = threadIdx.y * TILE_WIDTH_X + threadIdx.x + TILE_WIDTH_X * TILE_WIDTH_Y;
dest_y = dest / INPUT_TILE_X;
dest_x = dest % INPUT_TILE_X;
src_y = blockIdx.y * TILE_WIDTH_Y + dest_y - ker_r;
src_x = blockIdx.x * TILE_WIDTH_X + dest_x - ker_r;
if (dest_y < INPUT_TILE_Y)
{
if (src_y >= 0 && src_y < height && src_x >= 0 && src_x < width)
N_ds[dest_y][dest_x] = image[(src_y * width + src_x)];
else
N_ds[dest_y][dest_x] = 0;
}
__syncthreads();
float temp = 0;
int y, x;
for (y = 0; y < KER; y++)
for (x = 0; x < KER; x++)
temp += (N_ds[threadIdx.y + y][threadIdx.x + x] * ker[y * KER + x]);
y = blockIdx.y * TILE_WIDTH_Y + threadIdx.y;
x = blockIdx.x * TILE_WIDTH_X + threadIdx.x;
if (y < height && x < width)
//printf("%f %d\n", temp, (int)((uchar)temp));
out[(y * width + x)] = (uchar)temp;
}
__global__ void dumb_device_convolution (uchar *image, uchar *out, int width, int height) {
int ker_r = KER / 2;
if (blockIdx.x * blockDim.x + threadIdx.x < width && blockIdx.y * blockDim.y + threadIdx.y < height) {
float accum = 0;
for (int i = 0; i < KER; i++) {
for (int j = 0; j < KER; j++) {
int y = blockIdx.y * blockDim.y + threadIdx.y - ker_r + i;
int x = blockIdx.x * blockDim.x + threadIdx.x - ker_r + j;
if (y >= 0 && y < height && x >= 0 && x < width) {
accum += image[y * width + x] * ker[i * KER + j];
}
}
}
out[(blockIdx.y * blockDim.y + threadIdx.y) * width + blockIdx.x * blockDim.x + threadIdx.x] = (uchar)accum;
}
}
/*
Wrapper for device convolution
@param image: input image as cv::Mat object
@param kernel_h: kernel
*/
cv::Mat cuda_convolution (const cv::Mat& image, const float kernel_h[KER*KER]) {
uchar *d_input, *d_output;
//size_t pitch;
cudaMalloc(&d_input, image.rows*image.cols*sizeof(uchar));
//cudaMallocPitch(&d_input, &pitch, image.cols*sizeof(uchar), image.rows);
cudaMalloc(&d_output, image.rows*image.cols*sizeof(uchar));
double start = omp_get_wtime();
cv::Mat output(image.rows, image.cols, CV_8UC1, cv::Scalar(0));
cudaMemcpy(d_input, image.data, image.rows*image.cols*sizeof(uchar), cudaMemcpyHostToDevice);
//cudaMemcpy2D(d_input, pitch, image.data, pitch, image.cols * sizeof(uchar), image.rows, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(ker, kernel_h, KER*KER*sizeof(float));
dim3 dimGrid(ceil(image.cols/(float)TILE_WIDTH_X), ceil(image.rows/(float)TILE_WIDTH_Y), 1);
dim3 dimBlock(TILE_WIDTH_X, TILE_WIDTH_Y, 1);
device_convolution<<<dimGrid, dimBlock>>>(d_input, d_output, image.cols, image.rows);
cudaDeviceSynchronize();
cudaMemcpy(output.data, d_output, image.rows*image.cols*sizeof(uchar), cudaMemcpyDeviceToHost);
double end = omp_get_wtime();
std::cout << (end-start)<<std::endl;
cudaFree(d_input);
cudaFree(d_output);
return output;
}