<a href="https://colab.research.google.com/github/AshmitB05/cuda-parallel-programming-ece408/blob/main/Histogram_Equalization_CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [None]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpr_s7z60c".


In [None]:
# Detect the selected GPU and its NVIDIA architecture.
# Defines gpu_info, gpu_name, compute_cap and gpu_arch for use by later cells.
import subprocess

_NO_GPU_MESSAGE = "Error: No GPU found. Please select a GPU runtime environment."

try:
    # nvidia-smi prints one "name, compute_cap" CSV row per visible GPU.
    gpu_info = subprocess.run(
        ["nvidia-smi", "--query-gpu=name,compute_cap", "--format=csv,noheader,nounits"],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
except OSError as exc:
    # The nvidia-smi executable is missing or cannot be started.
    raise RuntimeError(_NO_GPU_MESSAGE) from exc
except subprocess.CalledProcessError as exc:
    # nvidia-smi ran but exited with an error; surface what it reported.
    _details = (exc.stdout or exc.stderr or "").strip()
    raise RuntimeError(f"{_NO_GPU_MESSAGE} nvidia-smi reported: {_details}") from exc

_rows = gpu_info.splitlines()
if not _rows:
    raise RuntimeError(_NO_GPU_MESSAGE)

# Use the first GPU row; split on the LAST comma so a device name that itself
# contains a comma is kept intact.
gpu_name, _, compute_cap = (field.strip() for field in _rows[0].rpartition(','))
if not gpu_name or not compute_cap.replace('.', '').isdigit():
    raise RuntimeError(f"Unexpected nvidia-smi output: {_rows[0]!r}")

# Compute capability "7.5" becomes the nvcc architecture name "sm_75".
gpu_arch = f"sm_{compute_cap.replace('.', '')}"

print(f"{'GPU Name':<15}: {gpu_name}")
print(f"{'Architecture':<15}: {gpu_arch}")

GPU Name       : Tesla T4
Architecture   : sm_75


In [None]:
%%cuda -c "--gpu-architecture $gpu_arch"
#include <stdio.h>
#include <stdlib.h>
/**
 * Converts a planar RGB image to 8-bit grayscale.
 *
 * Launch layout: 2D grid of 2D blocks; the thread at global (x, y) handles the
 * pixel at (col = x, row = y). Threads that fall outside the image return
 * without touching memory, so the grid only needs to cover the image.
 * No shared memory is used.
 *
 * @param red     width*height red samples, row-major.
 * @param green   width*height green samples, row-major.
 * @param blue    width*height blue samples, row-major.
 * @param output  receives width*height gray samples, row-major.
 * @param width   image width in pixels.
 * @param height  image height in pixels.
 */
__global__
void kernel1(const unsigned char* red,const unsigned char* green,const unsigned char* blue,unsigned char* output,int width,int height){
    const int row=blockIdx.y*blockDim.y+threadIdx.y;
    const int col=blockIdx.x*blockDim.x+threadIdx.x;
    if(row>=height || col>=width){
        return;
    }
    // Compute the flat pixel index once instead of once per channel.
    const int idx=row*width+col;
    // Float literals keep the arithmetic in single precision; bare 0.21 etc.
    // are doubles and would promote the whole expression to double.
    const float gray=0.21f*red[idx] + 0.71f*green[idx] + 0.07f*blue[idx];
    // The weights sum to 0.99, so gray <= 252.45 and the truncating cast
    // always fits in an unsigned char.
    output[idx]=(unsigned char)gray;
}
/**
 * Accumulates a 256-bin histogram of an 8-bit image.
 *
 * Launch layout: 1D grid of 1D blocks. Any block size and any grid size give a
 * complete histogram: bins are cleared and flushed with block-strided loops,
 * and pixels are visited with a grid-strided loop.
 * Shared memory: 256 ints, statically allocated (one private histogram per
 * block, merged into global memory once per block).
 *
 * @param input   width*height 8-bit samples.
 * @param output  256 counters. Counts are ADDED with atomicAdd, so the caller
 *                must zero this buffer before the launch.
 * @param width   image width in pixels.
 * @param height  image height in pixels.
 */
__global__
void kernel2(const unsigned char* input,int* output,int width,int height){
    __shared__ int shared[256];

    // Clear every bin even when the block has fewer than 256 threads, and never
    // index past the array when it has more.
    for(int bin=threadIdx.x;bin<256;bin+=blockDim.x){
        shared[bin]=0;
    }
    __syncthreads();

    // 64-bit arithmetic so neither width*height nor the running index overflows.
    const long long numPixels=(long long)width*height;
    const long long stride=(long long)gridDim.x*blockDim.x;
    for(long long idx=(long long)blockIdx.x*blockDim.x+threadIdx.x;idx<numPixels;idx+=stride){
        atomicAdd(&shared[input[idx]],1);
    }
    __syncthreads();

    // Merge this block's private histogram into the global one; empty bins are
    // skipped to avoid pointless global atomics.
    for(int bin=threadIdx.x;bin<256;bin+=blockDim.x){
        const int count=shared[bin];
        if(count!=0){
            atomicAdd(&output[bin],count);
        }
    }
}


// Terminates the program with file/line context when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaStatus_ = (call);                                     \
        if (cudaStatus_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(cudaStatus_));                         \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Skips whitespace and '#' comment lines between PPM header tokens; stops at EOF.
static void skipPpmSeparators(FILE* fp) {
    for (;;) {
        int c = fgetc(fp);
        if (c == '#') {
            // Discard the rest of the comment line (EOF also ends it).
            while (c != EOF && c != '\n') {
                c = fgetc(fp);
            }
            if (c == EOF) {
                return;
            }
        } else if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
            continue;
        } else {
            if (c != EOF) {
                ungetc(c, fp);
            }
            return;
        }
    }
}

// Parses a P6 header and leaves fp at the first pixel byte; returns false (after
// printing the reason) when the header is malformed or unsupported.
static bool readPpmHeader(FILE* fp, int* width, int* height, int* maxval) {
    char magic[3] = {0, 0, 0};
    if (fscanf(fp, "%2s", magic) != 1 || magic[0] != 'P' || magic[1] != '6') {
        printf("Not a P6 file!\n");
        return false;
    }

    int fields = 0;
    skipPpmSeparators(fp);
    fields += (fscanf(fp, "%d", width) == 1);
    skipPpmSeparators(fp);
    fields += (fscanf(fp, "%d", height) == 1);
    skipPpmSeparators(fp);
    fields += (fscanf(fp, "%d", maxval) == 1);
    if (fields != 3) {
        fprintf(stderr, "Malformed PPM header\n");
        return false;
    }

    // Samples are read as one byte each, so a maxval above 255 (two-byte
    // samples) cannot be handled.
    if (*width <= 0 || *height <= 0 || *maxval <= 0 || *maxval > 255) {
        fprintf(stderr, "Unsupported PPM header: %d x %d, maxval %d\n",
                *width, *height, *maxval);
        return false;
    }
    // Keep width*height*3 representable as int: later code and both kernels
    // index pixels with int.
    if ((long long)(*width) * (*height) > 2147483647LL / 3) {
        fprintf(stderr, "Image too large: %d x %d\n", *width, *height);
        return false;
    }

    // Exactly one whitespace byte separates maxval from the pixel data.
    fgetc(fp);
    return true;
}

// Loads an 8-bit binary PPM; returns a malloc'd interleaved RGB buffer that the
// caller must free, or NULL after printing the reason.
static unsigned char* readPpm(const char* path, int* width, int* height, int* maxval) {
    FILE* fp = fopen(path, "rb");
    if (fp == NULL) {
        fprintf(stderr, "Cannot open %s\n", path);
        return NULL;
    }

    unsigned char* pixels = NULL;
    if (readPpmHeader(fp, width, height, maxval)) {
        const size_t numBytes = (size_t)(*width) * (size_t)(*height) * 3;
        pixels = (unsigned char*)malloc(numBytes);
        if (pixels == NULL) {
            fprintf(stderr, "Out of host memory\n");
        } else if (fread(pixels, 1, numBytes, fp) != numBytes) {
            fprintf(stderr, "%s: pixel data is truncated\n", path);
            free(pixels);
            pixels = NULL;
        }
    }
    fclose(fp);
    return pixels;
}

// Writes interleaved 8-bit RGB as a binary PPM; returns false on any I/O failure.
static bool writePpm(const char* path, const unsigned char* rgb, int width, int height) {
    FILE* fp = fopen(path, "wb");
    if (fp == NULL) {
        fprintf(stderr, "Cannot open %s for writing\n", path);
        return false;
    }
    const size_t numBytes = (size_t)width * (size_t)height * 3;
    bool ok = fprintf(fp, "P6\n%d %d\n255\n", width, height) > 0;
    ok = ok && fwrite(rgb, 1, numBytes, fp) == numBytes;
    if (fclose(fp) != 0) {
        ok = false;
    }
    if (!ok) {
        fprintf(stderr, "Failed to write %s\n", path);
    }
    return ok;
}

// Builds the histogram-equalization lookup table for a 256-bin histogram:
// lut[v] = floor(255 * (cdf(v) - cdfMin) / (total - cdfMin)), clamped to 0..255,
// where cdfMin is the first non-zero cumulative count.
static void buildEqualizationLut(const int* histogram, unsigned char* lut) {
    // Integer cumulative counts avoid float accumulation error, so the brightest
    // occupied level maps to exactly 255.
    long long cumulative[256];
    long long running = 0;
    long long cdfMin = 0;
    for (int i = 0; i < 256; i++) {
        running += histogram[i];
        cumulative[i] = running;
        if (cdfMin == 0) {
            cdfMin = running;  // stays 0 until the first occupied bin
        }
    }

    const long long span = running - cdfMin;
    for (int i = 0; i < 256; i++) {
        if (span <= 0) {
            // All pixels share one gray level (or the histogram is empty):
            // there is nothing to stretch, so keep values unchanged.
            lut[i] = (unsigned char)i;
            continue;
        }
        long long level = 255 * (cumulative[i] - cdfMin) / span;
        if (level < 0) level = 0;  // bins below the first occupied one
        if (level > 255) level = 255;
        lut[i] = (unsigned char)level;
    }
}

/**
 * Reads input.ppm, converts it to grayscale on the GPU (kernel1), builds the
 * gray-level histogram on the GPU (kernel2), derives an equalization lookup
 * table on the host and applies it to each RGB channel.
 *
 * Writes gray.ppm and output_equalized.ppm. Returns EXIT_SUCCESS when both
 * files were written, EXIT_FAILURE otherwise.
 */
int main() {
    int width = 0, height = 0, maxval = 0;
    unsigned char* data = readPpm("input.ppm", &width, &height, &maxval);
    if (data == NULL) {
        return EXIT_FAILURE;
    }

    printf("Width: %d\n", width);
    printf("Height: %d\n", height);
    printf("Maxval: %d\n", maxval);

    const int numPixels = width * height;  // range-checked by readPpmHeader
    const size_t planeBytes = (size_t)numPixels;

    // Heap buffers: image-sized arrays on the stack overflow it for large images.
    unsigned char* planes = (unsigned char*)malloc(3 * planeBytes);  // R plane, G plane, B plane
    unsigned char* gray = (unsigned char*)malloc(planeBytes);
    unsigned char* rgbOut = (unsigned char*)malloc(3 * planeBytes);  // reused for both output files
    if (planes == NULL || gray == NULL || rgbOut == NULL) {
        fprintf(stderr, "Out of host memory\n");
        free(planes);
        free(gray);
        free(rgbOut);
        free(data);
        return EXIT_FAILURE;
    }

    // De-interleave RGBRGB... into three contiguous planes for kernel1.
    unsigned char* red = planes;
    unsigned char* green = planes + planeBytes;
    unsigned char* blue = planes + 2 * planeBytes;
    for (size_t i = 0; i < planeBytes; i++) {
        red[i] = data[3 * i + 0];
        green[i] = data[3 * i + 1];
        blue[i] = data[3 * i + 2];
    }
    printf("%d %d %d\n", red[0], green[0], blue[0]);

    // One device allocation and one copy cover all three colour planes.
    unsigned char* planes_d = NULL;
    unsigned char* gray_d = NULL;
    int* histogram_d = NULL;
    CUDA_CHECK(cudaMalloc((void**)&planes_d, 3 * planeBytes));
    CUDA_CHECK(cudaMalloc((void**)&gray_d, planeBytes));
    CUDA_CHECK(cudaMalloc((void**)&histogram_d, 256 * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(planes_d, planes, 3 * planeBytes, cudaMemcpyHostToDevice));
    // kernel2 only adds to the histogram (atomicAdd), and cudaMalloc does not
    // clear memory, so the bins must be zeroed first.
    CUDA_CHECK(cudaMemset(histogram_d, 0, 256 * sizeof(int)));

    // Integer ceil-division so partial tiles at the right/bottom edges are covered.
    const dim3 block(16, 16, 1);
    const dim3 grid((width + 15) / 16, (height + 15) / 16, 1);
    kernel1<<<grid, block>>>(planes_d, planes_d + planeBytes, planes_d + 2 * planeBytes,
                             gray_d, width, height);
    CUDA_CHECK(cudaGetLastError());

    // The gray image is already on the device, so kernel2 reads it in place.
    // 256 threads per block gives kernel2 one thread per histogram bin.
    const dim3 block2(256, 1, 1);
    const dim3 grid2((numPixels + 255) / 256, 1, 1);
    kernel2<<<grid2, block2>>>(gray_d, histogram_d, width, height);
    CUDA_CHECK(cudaGetLastError());

    // Blocking copies: they also surface any error raised while the kernels ran.
    int histogram[256];
    CUDA_CHECK(cudaMemcpy(gray, gray_d, planeBytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(histogram, histogram_d, sizeof(histogram), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(planes_d));
    CUDA_CHECK(cudaFree(gray_d));
    CUDA_CHECK(cudaFree(histogram_d));

    // gray.ppm stores the gray value in all three channels.
    for (size_t i = 0; i < planeBytes; i++) {
        rgbOut[3 * i + 0] = gray[i];
        rgbOut[3 * i + 1] = gray[i];
        rgbOut[3 * i + 2] = gray[i];
    }
    bool ok = writePpm("gray.ppm", rgbOut, width, height);

    for (int i = 0; i < 256; i++) {
        printf("%d ", histogram[i]);
    }
    printf("\n");

    unsigned char lut[256];
    buildEqualizationLut(histogram, lut);

    // Apply the gray-level mapping to every R, G and B sample; the input is
    // already interleaved, so the output keeps the same byte order.
    for (size_t k = 0; k < 3 * planeBytes; k++) {
        rgbOut[k] = lut[data[k]];
    }
    if (writePpm("output_equalized.ppm", rgbOut, width, height)) {
        printf("Done! Saved to output_equalized.ppm\n");
    } else {
        ok = false;
    }

    free(rgbOut);
    free(gray);
    free(planes);
    free(data);
    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}


Width: 314
Height: 512
Maxval: 255
66 90 581 5 2 7 6 8 13 15 20 23 17 28 30 41 31 37 45 50 39 32 63 65 49 72 72 76 89 79 98 121 123 107 112 136 174 154 184 186 184 176 218 222 255 306 315 363 365 419 418 431 449 538 570 614 604 592 549 596 590 613 640 658 655 649 671 739 681 714 792 795 788 837 886 908 890 923 1006 1016 1000 1033 1057 1132 1103 1167 1213 1227 1297 1355 1330 1329 1347 1434 1474 1525 1519 1511 1584 1549 1669 1689 1670 1754 1619 1736 1816 1706 1891 1798 1928 1890 1876 1974 1951 1848 1887 1908 1890 1858 1897 1913 1908 1891 1826 1856 1864 1813 1731 1687 1677 1541 1600 1525 1530 1370 1393 1336 1358 1332 1213 1260 1270 1159 1123 1002 1014 971 969 918 896 914 859 830 884 760 777 759 719 705 643 668 669 613 599 625 586 515 500 488 439 378 376 358 335 228 244 214 190 150 161 141 126 103 119 95 103 81 104 79 71 70 72 80 67 71 49 61 59 42 60 51 48 34 44 44 46 32 33 46 33 47 51 37 38 53 44 40 50 48 45 67 73 64 54 57 63 41 53 44 36 30 48 28 42 37 41 40 32 32 30 32 36 31 66 102 295 2

In [None]:
!ls /content


gray.ppm  input.ppm  output_equalized.ppm  sample_data
