In [1]:
!nvidia-smi


Sun Nov 30 15:06:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   61C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!apt-get update
!apt-get install -y libpng-dev

Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [83.6 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,153 kB]
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,491 kB]
Get:12 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/main a

In [8]:
%%writefile image_filters_working_2gpu.cu
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

#include <png.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

// ---- constant kernels ----
// 3x3 stencil weights kept in __constant__ memory. The GPU kernels walk each
// table with a running index while iterating ky (outer) then kx (inner), so
// entry 0 is the top-left neighbour and entry 4 is the centre pixel.
// K_BLUR / K_DENOISE: box filter; the kernels divide the weighted sum by 9.
// K_EDGE: centre weight 8 with -1 neighbours; the kernels add a +128 bias
//         instead of normalising.
// NOTE(review): K_DENOISE holds the same weights as K_BLUR.
__constant__ int K_BLUR[9]    = {1,1,1,1,1,1,1,1,1};
__constant__ int K_EDGE[9]    = {-1,-1,-1,-1,8,-1,-1,-1,-1};
__constant__ int K_DENOISE[9] = {1,1,1,1,1,1,1,1,1};

// ---------------- small helpers ----------------
// Restricts v to the closed interval [lo, hi]; usable from host and device.
// The lower bound is tested first, so lo is returned whenever v < lo, even if
// the caller passes lo > hi.
__host__ __device__ inline int clamp_int(int v, int lo, int hi){
    return (v < lo) ? lo
         : (v > hi) ? hi
         : v;
}
// Saturates a pixel coordinate: negative values map to 0, values above maxv
// map to maxv, everything else is returned unchanged. The negative test comes
// first, so a negative x yields 0 even when maxv itself is negative.
__host__ __device__ inline int sat_coord(int x, int maxv){
    return (x < 0) ? 0 : ((x > maxv) ? maxv : x);
}

// ---------------- CPU reference filters ----------------
// CPU reference 3x3 box blur over an interleaved image.
//
//   in   source pixels, w*h*c bytes, channel-interleaved, row-major
//   out  destination; grown to w*h*c bytes if it is smaller
//   w,h  image width / height in pixels
//   c    channels per pixel
//
// Every byte is first copied from `in`, then each interior pixel (one that has
// all eight neighbours) is replaced, per channel, by the integer mean of its
// 3x3 neighbourhood. The one-pixel frame therefore keeps its input value.
// Nothing is written when a dimension is non-positive or when `in` holds
// fewer than w*h*c bytes.
void cpu_blur(const std::vector<unsigned char>& in,std::vector<unsigned char>& out,int w,int h,int c){
    if(w<=0 || h<=0 || c<=0) return;

    // All offsets are size_t: w*h*c can exceed INT_MAX for large images.
    const size_t row   = (size_t)w * c;
    const size_t total = row * (size_t)h;
    if(in.size() < total) return;
    if(out.size() < total) out.resize(total);

    for(size_t i=0;i<total;i++) out[i]=in[i];

    for(int y=1;y<h-1;y++)
        for(int x=1;x<w-1;x++)
            for(int ch=0;ch<c;ch++){
                int sum=0;
                for(int ky=-1;ky<=1;ky++){
                    const size_t line = (size_t)(y+ky)*row + ch;
                    for(int kx=-1;kx<=1;kx++)
                        sum += in[line + (size_t)(x+kx)*c];   // all weights are 1
                }
                out[(size_t)y*row + (size_t)x*c + ch] = (unsigned char)(sum/9);
            }
}

// CPU reference 3x3 edge detector over an interleaved image.
//
//   in   source pixels, w*h*c bytes, channel-interleaved, row-major
//   out  destination; grown to w*h*c bytes if it is smaller
//   w,h  image width / height in pixels
//   c    channels per pixel
//
// Every byte is first copied from `in`, then each interior pixel is replaced,
// per channel, by the response of the {-1 ... 8 ... -1} stencil biased by +128
// and clamped to [0,255]. The one-pixel frame keeps its input value.
// Nothing is written when a dimension is non-positive or when `in` holds
// fewer than w*h*c bytes.
void cpu_edge(const std::vector<unsigned char>& in,std::vector<unsigned char>& out,int w,int h,int c){
    static const int kernel[3][3]={{-1,-1,-1},{-1,8,-1},{-1,-1,-1}};
    if(w<=0 || h<=0 || c<=0) return;

    // All offsets are size_t: w*h*c can exceed INT_MAX for large images.
    const size_t row   = (size_t)w * c;
    const size_t total = row * (size_t)h;
    if(in.size() < total) return;
    if(out.size() < total) out.resize(total);

    for(size_t i=0;i<total;i++) out[i]=in[i];

    for(int y=1;y<h-1;y++)
        for(int x=1;x<w-1;x++)
            for(int ch=0;ch<c;ch++){
                int sum=0;
                for(int ky=-1;ky<=1;ky++){
                    const size_t line = (size_t)(y+ky)*row + ch;
                    for(int kx=-1;kx<=1;kx++)
                        sum += in[line + (size_t)(x+kx)*c] * kernel[ky+1][kx+1];
                }
                // +128 centres a zero response at mid-grey before clamping.
                out[(size_t)y*row + (size_t)x*c + ch] = (unsigned char)clamp_int(sum+128,0,255);
            }
}

// CPU reference "denoise" filter: a 3x3 mean over an interleaved image.
//
//   in   source pixels, w*h*c bytes, channel-interleaved, row-major
//   out  destination; grown to w*h*c bytes if it is smaller
//   w,h  image width / height in pixels
//   c    channels per pixel
//
// Every byte is first copied from `in`, then each interior pixel is replaced,
// per channel, by the integer mean of its 3x3 neighbourhood (all weights 1,
// divisor 9). The one-pixel frame keeps its input value.
// Nothing is written when a dimension is non-positive or when `in` holds
// fewer than w*h*c bytes.
void cpu_denoise(const std::vector<unsigned char>& in,std::vector<unsigned char>& out,int w,int h,int c){
    const int ksum=9;
    if(w<=0 || h<=0 || c<=0) return;

    // All offsets are size_t: w*h*c can exceed INT_MAX for large images.
    const size_t row   = (size_t)w * c;
    const size_t total = row * (size_t)h;
    if(in.size() < total) return;
    if(out.size() < total) out.resize(total);

    for(size_t i=0;i<total;i++) out[i]=in[i];

    for(int y=1;y<h-1;y++)
        for(int x=1;x<w-1;x++){
            const size_t dst = (size_t)y*row + (size_t)x*c;
            for(int ch=0;ch<c;ch++){
                int sum=0;
                for(int ky=-1;ky<=1;ky++)
                    for(int kx=-1;kx<=1;kx++)
                        sum += in[(size_t)(y+ky)*row + (size_t)(x+kx)*c + ch];
                out[dst+ch] = (unsigned char)(sum/ksum);
            }
        }
}

// ---------------- CUDA kernels ----------------
// 3x3 stencil filter that reads every neighbour straight from global memory.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; the grid must
// cover at least w x h threads. No shared memory is used.
//
//   in     source image, w*h*c bytes, channel-interleaved, row-major
//   out    destination image of the same size; must not alias `in`
//   w,h    image size in pixels (non-positive values make the launch a no-op)
//   c      channels per pixel
//   ftype  0 = blur (K_BLUR, /9), 1 = edge (K_EDGE, +128), other = denoise
//          (K_DENOISE, /9)
//
// Pixels on the one-pixel frame have no complete neighbourhood and are copied
// from `in`, which matches the CPU reference filters and gpu_filter_shared.
// Results are clamped to [0,255].
__global__ void gpu_filter_global(const unsigned char* __restrict__ in, unsigned char* out,
                                  int w, int h, int c, int ftype)
{
    const int x=blockIdx.x*blockDim.x+threadIdx.x;
    const int y=blockIdx.y*blockDim.y+threadIdx.y;
    if(x>=w||y>=h) return;                       // grid tail / empty image

    // 64-bit offsets: (y*w+x)*c overflows int for large images.
    const size_t row    = (size_t)w*c;
    const size_t centre = (size_t)y*row + (size_t)x*c;

    if(x==0||y==0||x==w-1||y==h-1){
        // Frame pixel: pass the input through so `out` is fully defined.
        for(int ch=0; ch<c; ch++)
            out[centre+ch] = in[centre+ch];
        return;
    }

    const int* K = (ftype==0?K_BLUR:(ftype==1?K_EDGE:K_DENOISE));
    const int ksum = (ftype==1?1:9);

    for(int ch=0; ch<c; ch++){
        int sum=0, idx=0;
        for(int ky=-1;ky<=1;ky++){
            const size_t line = (size_t)(y+ky)*row + ch;
            for(int kx=-1;kx<=1;kx++)
                sum += in[line + (size_t)(x+kx)*c] * K[idx++];
        }

        const int v = (ftype==1?sum+128:sum/ksum);
        out[centre+ch] = clamp_int(v,0,255);
    }
}

// 3x3 stencil filter that stages each block's pixels plus a one-pixel halo in
// shared memory before convolving.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; the grid must
// cover at least w x h threads. Preconditions: blockDim.x and blockDim.y must
// not exceed BLOCK_SIZE and c must be in [1,4], because the static tile is
// (BLOCK_SIZE+2) x (BLOCK_SIZE+2) x 4 bytes. A launch that violates them, or
// that passes a non-positive w/h, returns without touching `out`.
//
//   in     source image, w*h*c bytes, channel-interleaved, row-major
//   out    destination image of the same size; must not alias `in`
//   ftype  0 = blur (K_BLUR, /9), 1 = edge (K_EDGE, +128), other = denoise
//          (K_DENOISE, /9)
//
// Frame pixels (no complete neighbourhood) are copied from `in`; results are
// clamped to [0,255].
__global__ void gpu_filter_shared(const unsigned char* __restrict__ in, unsigned char* out,
                                  int w, int h, int c, int ftype)
{
    __shared__ unsigned char tile[BLOCK_SIZE+2][BLOCK_SIZE+2][4];

    const int bw=(int)blockDim.x, bh=(int)blockDim.y;

    // These conditions are identical for every thread of the block, so the
    // early return cannot strand anyone at the barrier below.
    if(w<=0||h<=0||c<=0||c>4||bw>BLOCK_SIZE||bh>BLOCK_SIZE) return;

    const int tx=threadIdx.x, ty=threadIdx.y;
    const int x=(int)blockIdx.x*bw+tx;
    const int y=(int)blockIdx.y*bh+ty;

    // 64-bit offsets: (y*w+x)*c overflows int for large images.
    const size_t row=(size_t)w*c;

    // Cooperative load of the whole (bw+2) x (bh+2) tile, corners included:
    // threads stride over the tile cells in linear order. Coordinates outside
    // the image are saturated to the nearest valid pixel.
    const int tile_w=bw+2, tile_h=bh+2;
    const int origin_x=(int)blockIdx.x*bw-1;
    const int origin_y=(int)blockIdx.y*bh-1;
    const int nthreads=bw*bh;
    for(int cell=ty*bw+tx; cell<tile_w*tile_h; cell+=nthreads){
        const int ly=cell/tile_w;
        const int lx=cell%tile_w;
        const int gx=sat_coord(origin_x+lx,w-1);
        const int gy=sat_coord(origin_y+ly,h-1);
        const size_t src=(size_t)gy*row+(size_t)gx*c;
        for(int ch=0;ch<c;ch++)
            tile[ly][lx][ch] = in[src+ch];
    }

    __syncthreads();

    if(x>=w||y>=h) return;                       // grid tail

    const size_t centre=(size_t)y*row+(size_t)x*c;

    if(x==0||y==0||x==w-1||y==h-1){
        // Frame pixel: pass the input through.
        for(int ch=0;ch<c;ch++)
            out[centre+ch] = in[centre+ch];
        return;
    }

    const int* K = (ftype==0?K_BLUR:(ftype==1?K_EDGE:K_DENOISE));
    const int ksum = (ftype==1?1:9);

    for(int ch=0;ch<c;ch++){
        int sum=0, idx=0;
        for(int ky=-1;ky<=1;ky++)
            for(int kx=-1;kx<=1;kx++)
                sum += tile[ty+1+ky][tx+1+kx][ch] * K[idx++];
        const int v=(ftype==1?sum+128:sum/ksum);
        out[centre+ch] = clamp_int(v,0,255);
    }
}

// ---------------- PNG loading ----------------
// Decodes a PNG file into an 8-bit RGBA buffer.
//
//   f    path of the PNG file
//   img  receives w*h*4 bytes, row-major, channel-interleaved
//   w,h  receive the image size in pixels
//   c    receives the channel count, always 4 on success
//
// Returns true on success. On failure a message is written to std::cerr,
// false is returned and the file handle and libpng structures are released.
// 16-bit samples are reduced to 8 bits, palette and grey images are expanded
// to RGB, and images without alpha get an opaque (0xFF) alpha channel.
bool load_png(const char* f,std::vector<unsigned char>& img,int &w,int &h,int &c){
    FILE* fp=fopen(f,"rb");
    if(!fp){ std::cerr<<"Cannot open "<<f<<"\n"; return false; }

    png_structp png=png_create_read_struct(PNG_LIBPNG_VER_STRING,nullptr,nullptr,nullptr);
    if(!png){
        std::cerr<<"Cannot initialise libpng for "<<f<<"\n";
        fclose(fp);
        return false;
    }
    png_infop info=png_create_info_struct(png);
    if(!info){
        std::cerr<<"Cannot initialise libpng for "<<f<<"\n";
        png_destroy_read_struct(&png,nullptr,nullptr);
        fclose(fp);
        return false;
    }

    // Declared before setjmp so that a longjmp out of libpng lands in a frame
    // where this object is in scope and is destroyed by the normal return.
    std::vector<png_bytep> rows;

    // libpng reports decode errors by longjmp-ing back to this point.
    if(setjmp(png_jmpbuf(png))){
        std::cerr<<"Failed to decode PNG "<<f<<"\n";
        png_destroy_read_struct(&png,&info,nullptr);
        fclose(fp);
        return false;
    }

    png_init_io(png,fp);
    png_read_info(png,info);
    w=(int)png_get_image_width(png,info);
    h=(int)png_get_image_height(png,info);
    const png_byte color_type=png_get_color_type(png,info);
    const png_byte bit_depth=png_get_bit_depth(png,info);

    if(bit_depth==16) png_set_strip_16(png);
    if(color_type==PNG_COLOR_TYPE_PALETTE) png_set_palette_to_rgb(png);
    if(color_type==PNG_COLOR_TYPE_GRAY && bit_depth<8)
        png_set_expand_gray_1_2_4_to_8(png);
    if(png_get_valid(png,info,PNG_INFO_tRNS))
        png_set_tRNS_to_alpha(png);
    // Grey and grey+alpha would otherwise decode to 2 channels, not 4.
    if(color_type==PNG_COLOR_TYPE_GRAY || color_type==PNG_COLOR_TYPE_GRAY_ALPHA)
        png_set_gray_to_rgb(png);
    png_set_filler(png,0xFF,PNG_FILLER_AFTER);

    // Make `info` reflect the requested transformations so the decoded row
    // size can be validated against the buffer layout.
    png_read_update_info(png,info);

    c=4;
    const size_t stride=(size_t)w*c;
    if(png_get_rowbytes(png,info)!=stride){
        std::cerr<<"Unexpected pixel layout in "<<f<<"\n";
        png_destroy_read_struct(&png,&info,nullptr);
        fclose(fp);
        return false;
    }

    img.resize(stride*(size_t)h);
    rows.resize(h);
    for(int y=0;y<h;y++) rows[y]=&img[(size_t)y*stride];   // size_t: no int overflow
    png_read_image(png,rows.data());

    png_destroy_read_struct(&png,&info,nullptr);
    fclose(fp);
    return true;
}

// ------------------------ MAIN (CPU vs. multi-GPU benchmark) ------------------------
// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_check_err_ = (call);                                 \
        if (cuda_check_err_ != cudaSuccess) {                                 \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudaGetErrorString(cuda_check_err_) << "\n"; \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

namespace {

// Everything one GPU needs to filter its horizontal band of the image.
struct DeviceSlab {
    int device = 0;                  // CUDA device ordinal
    int first_row = 0;               // first image row uploaded (halo included)
    int rows = 0;                    // number of rows uploaded
    int own_first = 0;               // first image row this device is responsible for
    int own_rows = 0;                // number of rows this device is responsible for
    unsigned char* d_in = nullptr;   // device copy of the uploaded rows
    unsigned char* d_out = nullptr;  // device result buffer, same size as d_in
    cudaEvent_t start = nullptr;     // timing events, created on `device`
    cudaEvent_t stop = nullptr;
    dim3 grid;                       // launch grid covering w x rows pixels
};

// Launches filter `f` on every slab (shared-memory kernel if use_shared, else
// the global-memory kernel) and returns the longest per-device kernel time in
// milliseconds. Each device is timed with its own events.
float run_filter(std::vector<DeviceSlab>& slabs, const dim3& block,
                 int w, int c, int f, bool use_shared){
    // Issue all launches first so the devices can work concurrently.
    for(size_t i=0;i<slabs.size();i++){
        DeviceSlab& s = slabs[i];
        CUDA_CHECK(cudaSetDevice(s.device));
        CUDA_CHECK(cudaEventRecord(s.start));
        if(use_shared)
            gpu_filter_shared<<<s.grid,block>>>(s.d_in,s.d_out,w,s.rows,c,f);
        else
            gpu_filter_global<<<s.grid,block>>>(s.d_in,s.d_out,w,s.rows,c,f);
        CUDA_CHECK(cudaGetLastError());          // launch-configuration errors
        CUDA_CHECK(cudaEventRecord(s.stop));
    }

    float slowest = 0.0f;
    for(size_t i=0;i<slabs.size();i++){
        DeviceSlab& s = slabs[i];
        CUDA_CHECK(cudaSetDevice(s.device));
        CUDA_CHECK(cudaEventSynchronize(s.stop)); // also surfaces in-kernel faults
        float ms = 0.0f;
        CUDA_CHECK(cudaEventElapsedTime(&ms,s.start,s.stop));
        slowest = std::max(slowest, ms);
    }
    return slowest;
}

// Copies the rows each device is responsible for back into the host image;
// halo rows are skipped.
void gather_output(const std::vector<DeviceSlab>& slabs, std::vector<unsigned char>& out,
                   size_t row_bytes){
    for(size_t i=0;i<slabs.size();i++){
        const DeviceSlab& s = slabs[i];
        CUDA_CHECK(cudaSetDevice(s.device));
        CUDA_CHECK(cudaMemcpy(out.data() + (size_t)s.own_first*row_bytes,
                              s.d_out + (size_t)(s.own_first - s.first_row)*row_bytes,
                              (size_t)s.own_rows*row_bytes,
                              cudaMemcpyDeviceToHost));
    }
}

} // namespace

// Benchmarks the three filters on the CPU and on up to two GPUs (global- and
// shared-memory kernels) using input.png, then prints a timing table in ms.
// The image is split into horizontal bands, one per GPU; each band is uploaded
// with one extra row on each interior edge so pixels at the seam see their
// real neighbours. With a single GPU the whole image goes to device 0.
// Returns 0 on success and -1 if the image cannot be used; any CUDA failure
// terminates the program through CUDA_CHECK.
int main(){
    std::vector<unsigned char> img;
    int w,h,c;
    if(!load_png("input.png",img,w,h,c)){
        std::cerr<<"Cannot open input.png\n";
        return -1;
    }
    if(w<=0 || h<=0){
        std::cerr<<"input.png contains no pixels\n";
        return -1;
    }

    std::vector<unsigned char> out(img.size());
    const size_t row_bytes = (size_t)w * c;

    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    if(device_count < 1){
        std::cerr<<"No CUDA device available\n";
        return -1;
    }

    // At most two bands, never more bands than GPUs or image rows.
    const int num_slabs = std::max(1, std::min(std::min(device_count, 2), h));
    const int rows_per_slab = h / num_slabs;

    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    std::vector<DeviceSlab> slabs(num_slabs);

    for(int i=0;i<num_slabs;i++){
        DeviceSlab& s = slabs[i];
        s.device    = i;
        s.own_first = i * rows_per_slab;
        // The last band absorbs the remainder when h is not divisible.
        const int own_end = (i == num_slabs-1) ? h : s.own_first + rows_per_slab;
        s.own_rows  = own_end - s.own_first;
        // One halo row above and below, clipped to the image.
        s.first_row = std::max(s.own_first - 1, 0);
        s.rows      = std::min(own_end + 1, h) - s.first_row;
        s.grid      = dim3((w + BLOCK_SIZE - 1)/BLOCK_SIZE,
                           (s.rows + BLOCK_SIZE - 1)/BLOCK_SIZE);

        const size_t slab_bytes = (size_t)s.rows * row_bytes;
        CUDA_CHECK(cudaSetDevice(s.device));
        CUDA_CHECK(cudaMalloc(&s.d_in,  slab_bytes));
        CUDA_CHECK(cudaMalloc(&s.d_out, slab_bytes));
        CUDA_CHECK(cudaMemcpy(s.d_in, img.data() + (size_t)s.first_row*row_bytes,
                              slab_bytes, cudaMemcpyHostToDevice));
        // Seed the result with the input so pixels a kernel does not write
        // (the band's frame) are well defined when copied back.
        CUDA_CHECK(cudaMemcpy(s.d_out, s.d_in, slab_bytes, cudaMemcpyDeviceToDevice));
        CUDA_CHECK(cudaEventCreate(&s.start));
        CUDA_CHECK(cudaEventCreate(&s.stop));

        // Warm-up launch on an empty image: pays the one-time start-up cost
        // outside the timed region without touching any pixel.
        gpu_filter_global<<<s.grid,block>>>(s.d_in,s.d_out,0,0,0,0);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    std::string filters[3]={"blur","edge","denoise"};
    double times[3][3]={};

    for(int f=0;f<3;f++){

        // CPU
        auto t0=std::chrono::high_resolution_clock::now();
        if(f==0) cpu_blur(img,out,w,h,c);
        else if(f==1) cpu_edge(img,out,w,h,c);
        else cpu_denoise(img,out,w,h,c);
        auto t1=std::chrono::high_resolution_clock::now();
        times[f][0]=std::chrono::duration<double,std::milli>(t1-t0).count();

        // GPU, global-memory kernel
        times[f][1]=run_filter(slabs,block,w,c,f,false);
        gather_output(slabs,out,row_bytes);

        // GPU, shared-memory kernel
        times[f][2]=run_filter(slabs,block,w,c,f,true);
        gather_output(slabs,out,row_bytes);
    }

    for(int i=0;i<num_slabs;i++){
        DeviceSlab& s = slabs[i];
        CUDA_CHECK(cudaSetDevice(s.device));
        CUDA_CHECK(cudaEventDestroy(s.start));
        CUDA_CHECK(cudaEventDestroy(s.stop));
        CUDA_CHECK(cudaFree(s.d_in));
        CUDA_CHECK(cudaFree(s.d_out));
    }

    std::cout<<"\n=== Timing Table (ms) ===\n";
    std::cout<<"Filter\tCPU\tGPU_Global\tGPU_Shared\n";
    for(int f=0;f<3;f++){
        std::cout<<filters[f]<<"\t"<<times[f][0]<<"\t"<<times[f][1]<<"\t"<<times[f][2]<<"\n";
    }

    return 0;
}


Overwriting image_filters_working_2gpu.cu


In [9]:
!nvcc image_filters_working_2gpu.cu -o image_filters \
    -lpng16 \
    -O3

In [11]:
!./image_filters


=== Timing Table (ms) ===
Filter	CPU	GPU_Global	GPU_Shared
blur	318.101	0.008544	0.00688
edge	395.917	0.009728	0.007104
denoise	319.603	0.010368	0.006432
