In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [3]:
!wget https://raw.githubusercontent.com/Dewjy02/HPC-ASSIGNMENT/b06f2b26f555c88c7d1d50e6e575fc5e1aaeb650/gaussianBoxBlur/lodepng.cpp
!wget https://raw.githubusercontent.com/Dewjy02/HPC-ASSIGNMENT/b06f2b26f555c88c7d1d50e6e575fc5e1aaeb650/gaussianBoxBlur/lodepng.h
!wget https://raw.githubusercontent.com/Dewjy02/HPC-ASSIGNMENT/b06f2b26f555c88c7d1d50e6e575fc5e1aaeb650/gaussianBoxBlur/church.png
!wget https://raw.githubusercontent.com/Dewjy02/HPC-ASSIGNMENT/b06f2b26f555c88c7d1d50e6e575fc5e1aaeb650/gaussianBoxBlur/horse.png

--2025-10-21 21:25:41--  https://raw.githubusercontent.com/Dewjy02/HPC-ASSIGNMENT/b06f2b26f555c88c7d1d50e6e575fc5e1aaeb650/gaussianBoxBlur/lodepng.cpp
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 243691 (238K) [text/plain]
Saving to: ‘lodepng.cpp’


2025-10-21 21:25:42 (15.2 MB/s) - ‘lodepng.cpp’ saved [243691/243691]

--2025-10-21 21:25:42--  https://raw.githubusercontent.com/Dewjy02/HPC-ASSIGNMENT/b06f2b26f555c88c7d1d50e6e575fc5e1aaeb650/gaussianBoxBlur/lodepng.h
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 92033 (90K) [te

In [None]:
%%writefile Image_gaussian_blur.cu
#include <cmath>
#include <cstddef>
#include <exception>
#include <iostream>
#include <string>
#include <vector>

#include <cuda_runtime.h>

#include "lodepng.h"

/**
 * Un-normalised 2D Gaussian weight exp(-(x^2 + y^2) / (2 * sigma^2)).
 *
 * @param x     Horizontal offset from the kernel centre.
 * @param y     Vertical offset from the kernel centre.
 * @param sigma Standard deviation of the Gaussian.
 * @return The weight in [0, 1]. When sigma is not strictly positive (zero,
 *         negative or NaN) the Gaussian is treated as its degenerate limit:
 *         1 at the centre tap and 0 everywhere else, instead of evaluating
 *         0/0 and returning NaN.
 */
float host_gaussian_weight(int x, int y, float sigma) {
    // Square in double so large offsets cannot overflow int; the sum is exact
    // and is rounded to float only once.
    const float dist_sq = static_cast<float>(
        static_cast<double>(x) * x + static_cast<double>(y) * y);

    if (!(sigma > 0.0f)) {
        return dist_sq == 0.0f ? 1.0f : 0.0f;
    }

    return std::exp(-dist_sq / (2.0f * sigma * sigma));
}

/**
 * Blurs an interleaved 4-bytes-per-pixel image with a square weight table.
 *
 * Launch layout: a 2D grid of 2D blocks; each thread with (x, y) inside the
 * image produces exactly one output pixel, threads outside the image do
 * nothing. No shared memory is used.
 *
 * @param input            Source pixels, 4 bytes per pixel, row-major, width * height pixels.
 * @param output           Destination buffer with the same layout and size as input.
 * @param width            Image width in pixels.
 * @param height           Image height in pixels.
 * @param kernel_radius    Taps reach kernel_radius pixels in every direction.
 * @param gaussian_weights Row-major table of (2 * kernel_radius + 1)^2 weights.
 *
 * Taps that fall outside the image are skipped and the result is divided by
 * the sum of the weights actually used, so borders are not darkened. The
 * first three channels are filtered; the fourth is copied unchanged.
 */
__global__ void gaussian_blur_kernel(const unsigned char* input, unsigned char* output, int width, int height, int kernel_radius, const float* gaussian_weights) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) {
        return;
    }

    const int kernel_size = 2 * kernel_radius + 1;

    float r_sum = 0.0f;
    float g_sum = 0.0f;
    float b_sum = 0.0f;
    float weight_sum = 0.0f;

    for (int j = -kernel_radius; j <= kernel_radius; ++j) {
        const int current_y = y + j;
        if (current_y < 0 || current_y >= height) {
            continue;  // whole tap row lies outside the image
        }

        // 64-bit offsets: width * height * 4 can exceed INT_MAX for large images.
        const size_t image_row = static_cast<size_t>(current_y) * width;
        const size_t kernel_row = static_cast<size_t>(j + kernel_radius) * kernel_size;

        for (int i = -kernel_radius; i <= kernel_radius; ++i) {
            const int current_x = x + i;
            if (current_x < 0 || current_x >= width) {
                continue;
            }

            const size_t img_idx = (image_row + current_x) * 4;
            const float weight = gaussian_weights[kernel_row + (i + kernel_radius)];

            r_sum += input[img_idx] * weight;
            g_sum += input[img_idx + 1] * weight;
            b_sum += input[img_idx + 2] * weight;
            weight_sum += weight;
        }
    }

    const size_t out_idx = (static_cast<size_t>(y) * width + x) * 4;

    if (weight_sum > 0.0f) {
        // Round to nearest instead of truncating so float round-off cannot
        // turn e.g. 254.99998 into 254; clamp so the +0.5 can never pass 255.
        output[out_idx] = static_cast<unsigned char>(fminf(r_sum / weight_sum + 0.5f, 255.0f));
        output[out_idx + 1] = static_cast<unsigned char>(fminf(g_sum / weight_sum + 0.5f, 255.0f));
        output[out_idx + 2] = static_cast<unsigned char>(fminf(b_sum / weight_sum + 0.5f, 255.0f));
    } else {
        // Zero or NaN weight sum (degenerate weight table): dividing would make
        // the byte conversion undefined, so pass the source pixel through.
        output[out_idx] = input[out_idx];
        output[out_idx + 1] = input[out_idx + 1];
        output[out_idx + 2] = input[out_idx + 2];
    }
    output[out_idx + 3] = input[out_idx + 3]; // Preserve alpha channel
}


/**
 * Command-line entry point: decodes a PNG, blurs it on the GPU with
 * gaussian_blur_kernel and writes the result as a PNG.
 *
 * Usage: <program> <input_file.png> <output_file.png> <kernel_radius>
 *
 * kernel_radius must be a non-negative integer; radius 0 yields a single-tap
 * kernel (the image is copied unchanged). Sigma is derived as radius / 3.
 *
 * @return 0 on success, 1 on a usage, decoding, CUDA or encoding error.
 */
int main(int argc, char **argv) {
    if (argc < 4) {
        std::cerr << "Usage: " << argv[0] << " <input_file.png> <output_file.png> <kernel_radius>" << std::endl;
        return 1;
    }

    const char* input_filename = argv[1];
    const char* output_filename = argv[2];

    // Largest radius for which (2r + 1)^2 still fits in a 32-bit int, which is
    // how the weight table is indexed.
    const int max_kernel_radius = 23169;

    // std::stoi throws on non-numeric or out-of-range text and silently
    // accepts trailing garbage, so validate all three explicitly.
    int kernel_radius = 0;
    bool radius_ok = false;
    try {
        const std::string radius_arg = argv[3];
        std::size_t parsed_chars = 0;
        kernel_radius = std::stoi(radius_arg, &parsed_chars);
        radius_ok = parsed_chars == radius_arg.size() &&
                    kernel_radius >= 0 && kernel_radius <= max_kernel_radius;
    } catch (const std::exception&) {
        radius_ok = false;
    }
    if (!radius_ok) {
        std::cerr << "Invalid kernel_radius '" << argv[3] << "': expected an integer in [0, "
                  << max_kernel_radius << "]." << std::endl;
        return 1;
    }

    // Simple heuristic for sigma. Radius 0 has only the centre tap, whose
    // weight is 1 for any positive sigma, so substitute 1 to avoid sigma == 0.
    const float sigma = (kernel_radius > 0 ? kernel_radius : 1) / 3.0f;

    std::vector<unsigned char> image;
    unsigned width, height;

    unsigned error = lodepng::decode(image, width, height, input_filename);
    if (error) {
        std::cerr << "Decoder error " << error << ": " << lodepng_error_text(error) << std::endl;
        return 1;
    }

    std::cout << "Image loaded: " << width << "x" << height << " pixels." << std::endl;

    if (width == 0 || height == 0) {
        // An empty grid is an invalid launch configuration; fail clearly instead.
        std::cerr << "Image has no pixels; nothing to blur." << std::endl;
        return 1;
    }

    const size_t image_size = (size_t)width * height * 4;

    // Build the normalised weight table on the host. Sizes are computed in
    // size_t so large radii cannot overflow the byte count.
    const int kernel_size = 2 * kernel_radius + 1;
    const size_t weight_count = (size_t)kernel_size * kernel_size;
    const size_t gaussian_weights_size = weight_count * sizeof(float);

    std::vector<float> h_gaussian_weights;
    try {
        h_gaussian_weights.resize(weight_count);
    } catch (const std::exception&) {
        std::cerr << "Failed to allocate host memory for Gaussian weights." << std::endl;
        return 1;
    }

    float sum_weights = 0.0f;
    for (int j = -kernel_radius; j <= kernel_radius; ++j) {
        for (int i = -kernel_radius; i <= kernel_radius; ++i) {
            const size_t kernel_idx = (size_t)(j + kernel_radius) * kernel_size + (i + kernel_radius);
            h_gaussian_weights[kernel_idx] = host_gaussian_weight(i, j, sigma);
            sum_weights += h_gaussian_weights[kernel_idx];
        }
    }

    // Normalize weights
    for (float& weight : h_gaussian_weights) {
        weight /= sum_weights;
    }

    unsigned char* d_input = nullptr;
    unsigned char* d_output = nullptr;
    float* d_gaussian_weights = nullptr;

    // Single cleanup path for every exit; cudaFree on a null pointer is a no-op,
    // so it is safe to call before all three buffers have been allocated.
    auto release_device = [&]() {
        cudaFree(d_input);
        cudaFree(d_output);
        cudaFree(d_gaussian_weights);
    };

    // Reports a CUDA failure, frees device memory and tells the caller to bail out.
    auto failed = [&](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return false;
        }
        std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
        release_device();
        return true;
    };

    if (failed(cudaMalloc(&d_input, image_size), "cudaMalloc failed for d_input")) {
        return 1;
    }
    if (failed(cudaMalloc(&d_output, image_size), "cudaMalloc failed for d_output")) {
        return 1;
    }
    if (failed(cudaMalloc(&d_gaussian_weights, gaussian_weights_size), "cudaMalloc failed for d_gaussian_weights")) {
        return 1;
    }

    if (failed(cudaMemcpy(d_input, image.data(), image_size, cudaMemcpyHostToDevice),
               "cudaMemcpyHostToDevice failed for image data")) {
        return 1;
    }
    if (failed(cudaMemcpy(d_gaussian_weights, h_gaussian_weights.data(), gaussian_weights_size, cudaMemcpyHostToDevice),
               "cudaMemcpyHostToDevice failed for gaussian weights")) {
        return 1;
    }

    // One thread per pixel; ceil-divide so partial tiles at the edges are covered.
    dim3 threads_per_block(16, 16);
    dim3 num_blocks((width + threads_per_block.x - 1) / threads_per_block.x,
                    (height + threads_per_block.y - 1) / threads_per_block.y);

    gaussian_blur_kernel<<<num_blocks, threads_per_block>>>(d_input, d_output, width, height, kernel_radius, d_gaussian_weights);

    // Launch-configuration errors surface here; in-kernel faults at the sync below.
    if (failed(cudaGetLastError(), "Kernel launch failed")) {
        return 1;
    }
    if (failed(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed")) {
        return 1;
    }

    std::vector<unsigned char> output_image(image_size);
    if (failed(cudaMemcpy(output_image.data(), d_output, image_size, cudaMemcpyDeviceToHost),
               "cudaMemcpyDeviceToHost failed")) {
        return 1;
    }

    release_device();

    error = lodepng::encode(output_filename, output_image, width, height);
    if (error) {
        std::cerr << "Encoder error " << error << ": " << lodepng_error_text(error) << std::endl;
        return 1;
    }

    std::cout << "Blurred image saved as " << output_filename << std::endl;

    return 0;
}

Writing Image_gaussian_blur.cu


In [5]:
!nvcc -arch=sm_75 Image_gaussian_blur.cu lodepng.cpp -o Image_gaussian_blur
!ls -l

total 14984
-rw-r--r-- 1 root root 9123652 Oct 21 21:25 church.png
-rw-r--r-- 1 root root 4687937 Oct 21 21:25 horse.png
-rwxr-xr-x 1 root root 1172824 Oct 21 21:26 Image_gaussian_blur
-rw-r--r-- 1 root root    7612 Oct 21 21:25 Image_gaussian_blur.cu
-rw-r--r-- 1 root root  243691 Oct 21 21:25 lodepng.cpp
-rw-r--r-- 1 root root   92033 Oct 21 21:25 lodepng.h
drwxr-xr-x 1 root root    4096 Oct 20 20:02 sample_data


In [6]:
!./Image_gaussian_blur /content/horse.png /content/blurred_horse.png 50

Image loaded: 2318x3000 pixels.
Blurred image saved as /content/blurred_horse.png
