In [34]:
%%writefile  main.cu
#include <iostream>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <chrono>

// 3x3 Sobel coefficients for the horizontal gradient (Gx), stored row-major:
// element (row r, column c) lives at index r * 3 + c.
// Declared __constant__, i.e. a device-side symbol placed in CUDA constant
// memory; the filter code indexes it as (i + 1) * 3 + (j + 1) for i, j in [-1, 1].
// NOTE(review): as a device symbol this is meant to be read from device code;
// host code that dereferences it directly does not see the device values.
__constant__ int sobel_x_kernel[9] = {
    -1, 0, 1,
    -2, 0, 2,
    -1, 0, 1
};

// 3x3 Sobel coefficients for the vertical gradient (Gy), same row-major layout
// and same __constant__ (device constant memory) placement as sobel_x_kernel.
__constant__ int sobel_y_kernel[9] = {
    -1, -2, -1,
     0,  0,  0,
     1,  2,  1
};

/**
 * Sobel edge-magnitude filter, one thread per output pixel.
 *
 * Launch layout: 2D grid of 2D blocks covering at least width x height
 * threads; thread (x, y) produces output pixel (x, y). Threads that fall
 * outside the image do nothing. No shared memory is used.
 *
 * @param input   Device pointer to a tightly packed (row stride == width)
 *                8-bit single-channel image of width * height bytes.
 * @param output  Device pointer to a buffer of width * height bytes. Every
 *                in-image pixel is written: interior pixels receive
 *                min(255, floor(sqrt(gx^2 + gy^2))), and the one-pixel border
 *                (where the 3x3 window does not fit) is set to 0.
 * @param width   Image width in pixels.
 * @param height  Image height in pixels.
 */
__global__ void sobel_filter_kernel(const unsigned char* input, unsigned char* output, int width, int height) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so some threads lie outside the image.
    if (x >= width || y >= height) {
        return;
    }

    // 64-bit flat index: y * width can exceed INT_MAX for very large images.
    const size_t center = static_cast<size_t>(y) * width + x;

    // Border pixels have no complete 3x3 neighbourhood. Write a defined value
    // instead of leaving whatever the allocation happened to contain.
    if (x == 0 || y == 0 || x == width - 1 || y == height - 1) {
        output[center] = 0;
        return;
    }

    int gx = 0, gy = 0;
    #pragma unroll
    for (int i = -1; i <= 1; i++) {
        const size_t row = static_cast<size_t>(y + i) * width;
        #pragma unroll
        for (int j = -1; j <= 1; j++) {
            const int pixel = input[row + (x + j)];
            const int tap = (i + 1) * 3 + (j + 1);
            gx += pixel * sobel_x_kernel[tap];
            gy += pixel * sobel_y_kernel[tap];
        }
    }

    // |gx|, |gy| <= 4 * 255, so gx*gx + gy*gy fits in int and is exactly
    // representable as float.
    const int magnitude = min(255, static_cast<int>(sqrtf(static_cast<float>(gx * gx + gy * gy))));
    output[center] = static_cast<unsigned char>(magnitude);
}

/**
 * CPU reference implementation of the Sobel edge-magnitude filter.
 *
 * @param input   Host pointer to a tightly packed (row stride == width)
 *                8-bit single-channel image of width * height bytes.
 * @param output  Host pointer to a buffer of width * height bytes. Every pixel
 *                is written: interior pixels receive
 *                min(255, floor(sqrt(gx^2 + gy^2))), and the one-pixel border
 *                (where the 3x3 window does not fit) is set to 0.
 * @param width   Image width in pixels.
 * @param height  Image height in pixels.
 */
void sobel_filter_serial(const unsigned char* input, unsigned char* output, int width, int height) {
    // Host-side coefficient tables. The file-scope sobel_*_kernel arrays are
    // __constant__ device symbols and must not be read from host code, so the
    // CPU path keeps its own copies (row-major 3x3).
    static const int kSobelX[9] = {
        -1, 0, 1,
        -2, 0, 2,
        -1, 0, 1
    };
    static const int kSobelY[9] = {
        -1, -2, -1,
         0,  0,  0,
         1,  2,  1
    };

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            // 64-bit flat index: y * width can exceed INT_MAX for very large images.
            const size_t center = static_cast<size_t>(y) * width + x;

            // Border pixels have no complete 3x3 neighbourhood; give them a
            // defined value rather than leaving the caller's buffer untouched.
            if (x == 0 || y == 0 || x == width - 1 || y == height - 1) {
                output[center] = 0;
                continue;
            }

            int gx = 0, gy = 0;
            for (int i = -1; i <= 1; i++) {
                const size_t row = static_cast<size_t>(y + i) * width;
                for (int j = -1; j <= 1; j++) {
                    const int pixel = input[row + (x + j)];
                    const int tap = (i + 1) * 3 + (j + 1);
                    gx += pixel * kSobelX[tap];
                    gy += pixel * kSobelY[tap];
                }
            }

            // |gx|, |gy| <= 4 * 255, so the sum of squares fits in int and is
            // exactly representable as float.
            const int magnitude = static_cast<int>(sqrtf(static_cast<float>(gx * gx + gy * gy)));
            output[center] = static_cast<unsigned char>(magnitude > 255 ? 255 : magnitude);
        }
    }
}

/**
 * Loads a grayscale image, runs the Sobel filter on the CPU and on the GPU,
 * writes both results to "Serial.jpg" / "Parallel.jpg" and prints timings.
 *
 * Usage: main <image-path>
 *
 * @return 0 on success, -1 on a usage error, image load failure or CUDA error.
 */
int main(int argc, char *argv[]) {
    if (argc < 2) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "main") << " <image>" << std::endl;
        return -1;
    }

    cv::Mat image = cv::imread(argv[1], cv::IMREAD_GRAYSCALE);
    if (image.empty()) {
        std::cerr << "Error: Could not load image!" << std::endl;
        return -1;
    }
    // The filters index the pixels as a tightly packed width * height array.
    if (!image.isContinuous()) {
        image = image.clone();
    }

    const int width = image.cols;
    const int height = image.rows;
    // Computed in size_t so large images do not overflow int.
    const size_t num_bytes = static_cast<size_t>(width) * static_cast<size_t>(height);

    // Zero-initialised, Mat-owned output buffers: border pixels the filters may
    // not write are well defined, and no manual delete[] is needed.
    cv::Mat output_serial(height, width, CV_8UC1, cv::Scalar(0));
    cv::Mat output_parallel(height, width, CV_8UC1, cv::Scalar(0));

    const unsigned char* h_input = image.data;
    unsigned char* h_output_serial = output_serial.data;
    unsigned char* h_output_parallel = output_parallel.data;

    auto start_serial = std::chrono::high_resolution_clock::now();
    sobel_filter_serial(h_input, h_output_serial, width, height);
    auto end_serial = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> serial_time = end_serial - start_serial;

    unsigned char *d_input = nullptr, *d_output = nullptr;

    // Reports a failed CUDA call and releases the device buffers so every
    // error path can simply `return -1`. cudaFree(nullptr) is a no-op.
    auto cuda_ok = [&](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
        cudaFree(d_input);
        cudaFree(d_output);
        d_input = nullptr;
        d_output = nullptr;
        return false;
    };

    if (!cuda_ok(cudaMalloc((void**)&d_input, num_bytes), "cudaMalloc(d_input)")) return -1;
    if (!cuda_ok(cudaMalloc((void**)&d_output, num_bytes), "cudaMalloc(d_output)")) return -1;

    if (!cuda_ok(cudaMemcpy(d_input, h_input, num_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(H2D)")) return -1;
    // Give pixels the kernel may skip (image border) a defined value.
    if (!cuda_ok(cudaMemset(d_output, 0, num_bytes), "cudaMemset(d_output)")) return -1;

    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);

    auto start_parallel = std::chrono::high_resolution_clock::now();
    sobel_filter_kernel<<<gridSize, blockSize>>>(d_input, d_output, width, height);
    // Launch-configuration errors surface here; in-kernel faults at the sync.
    const cudaError_t launch_status = cudaGetLastError();
    const cudaError_t sync_status = cudaDeviceSynchronize();
    auto end_parallel = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> parallel_time = end_parallel - start_parallel;

    if (!cuda_ok(launch_status, "sobel_filter_kernel launch")) return -1;
    if (!cuda_ok(sync_status, "cudaDeviceSynchronize")) return -1;

    if (!cuda_ok(cudaMemcpy(h_output_parallel, d_output, num_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)")) return -1;

    if (!cv::imwrite("Serial.jpg", output_serial)) {
        std::cerr << "Warning: could not write Serial.jpg" << std::endl;
    }
    if (!cv::imwrite("Parallel.jpg", output_parallel)) {
        std::cerr << "Warning: could not write Parallel.jpg" << std::endl;
    }

    cudaFree(d_input);
    cudaFree(d_output);

    double speedup = serial_time.count() / parallel_time.count();
    // NOTE(review): this divides by the threads-per-block count (256), not by
    // the number of processing elements actually used, so values above 1 are
    // possible. Kept as-is to preserve the program's reported metric.
    double efficiency = speedup / (blockSize.x * blockSize.y);

    std::cout << "Serial Execution Time: " << serial_time.count() << " seconds" << std::endl;
    std::cout << "Parallel Execution Time: " << parallel_time.count() << " seconds" << std::endl;
    std::cout << "Speedup: " << speedup << std::endl;
    std::cout << "Efficiency: " << efficiency << std::endl;

    return 0;
}


Overwriting main.cu


In [35]:
!nvcc -w main.cu -o main `pkg-config --cflags --libs opencv4`
!chmod 755 main

In [36]:
!./main img2.jpg

Serial Execution Time: 0.469711 seconds
Parallel Execution Time: 0.000852519 seconds
Speedup: 550.969
Efficiency: 2.15222
