In [1]:
%%writefile cuda_stuff.cuh
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

#ifndef cuda_stuff_H
#define cuda_stuff_H

//MACRO TO DEBUG CUDA FUNCTIONS
/** Error checking,
 *  taken from https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
 */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

#endif

Writing cuda_stuff.cuh


# Convolution CPU


In [2]:
%%writefile convolution.cu
#include <sys/time.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

#include "cuda_stuff.cuh"

static int N=100;
#define NB_STEPS 50

double **cur_step;
double **next_step;

double weight [3][3]={{0,2,0}, {2,4,2}, {0,2,0}};

/* initialize matrixes */
void init() {
  int i;

  cur_step = (double **)malloc(sizeof(double*) * N);
  next_step = (double **)malloc(sizeof(double*) * N);

  srand(0);
  for(i=0; i<N; i++) {
    cur_step[i] = (double *)calloc(N, sizeof(double));
    next_step[i] = (double *)calloc(N, sizeof(double));
  }

  int nb_hot_spots= rand()%N;
  printf("Generating %d hot spots\n", nb_hot_spots);
  for(i=0; i<nb_hot_spots; i++) {
    int posx = rand()%N;
    int posy = rand()%N;
    cur_step[posx][posy] += N*10000;
    printf("%d,%d : %lf\n", posx, posy, cur_step[posx][posy]);
  }
}

/* dump the matrix in f */
void print_matrix(FILE* f,double** matrix) {
  int i, j;
  for(i=1; i<N-1; i++) {
    for(j=1; j<N-1; j++) {
      fprintf(f, " %.2f  ", matrix[i][j]);
    }
    fprintf(f, "\n");
  }
}

void compute() {
  int i, j;

  for(i=1; i< N-1; i++) {
    for(j=1; j< N-1; j++) {
      next_step[i][j] = (cur_step[i-1][j] * weight[0][1]
			+ cur_step[i+1][j] * weight[2][1]
			+ cur_step[i][j-1] * weight[1][0]
			+ cur_step[i][j+1] * weight[1][2]
			+ cur_step[i][j] * weight[1][1] ) / 5;
    }
  }

  /* swap buffers */
  double ** tmp = cur_step;
  cur_step = next_step;
  next_step = tmp;
}


int main(int argc, char**argv) {
  char* output_file="result.dat";
  int dump=1;

  struct timeval t1, t2;
  init();

  gettimeofday(&t1, NULL);
  for(int i=0; i< NB_STEPS; i++) {
    printf("STEP %d...\n", i);
    compute();
  }
  gettimeofday(&t2, NULL);

  double total_time = ((t2.tv_sec-t1.tv_sec)*1e6 + (t2.tv_usec - t1.tv_usec))/1e6;
  printf("%d steps in %lf sec (%lf sec/step)\n", NB_STEPS, total_time, total_time/NB_STEPS);

  if(dump){
    printf("dumping the result data in %s\n", output_file);
    FILE *f = fopen(output_file, "w");
    print_matrix(f, cur_step);
  } else {
    print_matrix(stdout, cur_step);
  }

  return EXIT_SUCCESS;
}

Writing convolution.cu


# Run CPU


In [3]:
!nvcc convolution.cu -o convolution

    char* output_file="result.dat";
                      ^


    char* output_file="result.dat";
                      ^




In [4]:
! ./convolution

Generating 83 hot spots
86,77 : 1000000.000000
15,93 : 1000000.000000
35,86 : 1000000.000000
92,49 : 1000000.000000
21,62 : 1000000.000000
27,90 : 1000000.000000
59,63 : 1000000.000000
26,40 : 1000000.000000
26,72 : 1000000.000000
36,11 : 1000000.000000
68,67 : 1000000.000000
29,82 : 1000000.000000
30,62 : 1000000.000000
23,67 : 1000000.000000
35,29 : 1000000.000000
2,22 : 1000000.000000
58,69 : 1000000.000000
67,93 : 1000000.000000
56,11 : 1000000.000000
42,29 : 1000000.000000
73,21 : 1000000.000000
19,84 : 1000000.000000
37,98 : 1000000.000000
24,15 : 1000000.000000
70,13 : 1000000.000000
26,91 : 1000000.000000
80,56 : 1000000.000000
73,62 : 1000000.000000
70,96 : 1000000.000000
81,5 : 1000000.000000
25,84 : 1000000.000000
27,36 : 1000000.000000
5,46 : 1000000.000000
29,13 : 1000000.000000
57,24 : 1000000.000000
95,82 : 1000000.000000
45,14 : 1000000.000000
67,34 : 1000000.000000
64,43 : 1000000.000000
50,87 : 1000000.000000
8,76 : 1000000.000000
78,88 : 1000000.000000
84,3 : 1000000

In [5]:
! apt-get install gnuplot

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  aglfn gnuplot-data gnuplot-qt libevdev2 libgudev-1.0-0 libinput-bin libinput10 liblua5.4-0
  libmd4c0 libmtdev1 libnotify4 libqt5core5a libqt5dbus5 libqt5gui5 libqt5network5
  libqt5printsupport5 libqt5svg5 libqt5widgets5 libwacom-bin libwacom-common libwacom9
  libwxbase3.0-0v5 libwxgtk3.0-gtk3-0v5 libxcb-icccm4 libxcb-image0 libxcb-keysyms1
  libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xinput0 libxcb-xkb1 libxkbcommon-x11-0
  qt5-gtk-platformtheme qttranslations5-l10n
Suggested packages:
  gnuplot-doc gnome-shell | notification-daemon qt5-image-formats-plugins qtwayland5
The following NEW packages will be installed:
  aglfn gnuplot gnuplot-data gnuplot-qt libevdev2 libgudev-1.0-0 libinput-bin libinput10
  liblua5.4-0 libmd4c0 libmtdev1 libnotify4 libqt5core5a libqt5dbus5 libqt5gui5 libqt5network5
  libqt5printsupport5 lib

In [6]:
! wget http://www-inf.telecom-sudparis.eu/COURS/IA704/plot.gp

--2025-02-03 16:41:47--  http://www-inf.telecom-sudparis.eu/COURS/IA704/plot.gp
Resolving www-inf.telecom-sudparis.eu (www-inf.telecom-sudparis.eu)... 157.159.52.20
Connecting to www-inf.telecom-sudparis.eu (www-inf.telecom-sudparis.eu)|157.159.52.20|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 96
Saving to: ‘plot.gp’


2025-02-03 16:41:49 (13.9 MB/s) - ‘plot.gp’ saved [96/96]



In [7]:
! gnuplot -p plot.gp

# Convolution GPU


In [8]:
%%writefile convolution_gpu.cu
#include <sys/time.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

#include "cuda_stuff.cuh"

static int N=100;
#define NB_STEPS 50

double **cur_step;
double **next_step;
double *cur_d;
double *next_d;

double weight [3][3]={{0,2,0}, {2,4,2}, {0,2,0}};
__constant__ double D_WEIGHT[3][3];


////////////////////////////////////////////////////////////////
//     Utils
////////////////////////////////////////////////////////////////
/* initialize matrixes */
void init() {
  int i;

  cur_step = (double **)malloc(sizeof(double*) * N);
  next_step = (double **)malloc(sizeof(double*) * N);
  gpuErrchk(cudaMalloc((void**) &cur_d, sizeof(double) * N * N));
  gpuErrchk(cudaMalloc((void**) &next_d, sizeof(double) * N * N));
  gpuErrchk(cudaMemcpyToSymbol(D_WEIGHT, weight, 3*3*sizeof(double)));

  srand(0);
  for(i=0; i<N; i++) {
    cur_step[i] = (double *)calloc(N, sizeof(double));
    next_step[i] = (double *)calloc(N, sizeof(double));
  }

  int nb_hot_spots= rand()%N;
  printf("Generating %d hot spots\n", nb_hot_spots);
  for(i=0; i<nb_hot_spots; i++) {
    int posx = rand()%N;
    int posy = rand()%N;
    cur_step[posx][posy] += N*10000;
    printf("%d,%d : %lf\n", posx, posy, cur_step[posx][posy]);
  }
}


/* dump the matrix in f */
void print_matrix(FILE* f,double** matrix) {
  int i, j;
  for(i=1; i<N-1; i++) {
    for(j=1; j<N-1; j++) {
      fprintf(f, " %.2f  ", matrix[i][j]);
    }
    fprintf(f, "\n");
  }
}


////////////////////////////////////////////////////////////////
//     Kernel convolution function
////////////////////////////////////////////////////////////////
__global__ void compute_gpu(double *cur_d, double *next_d, int N) {
  // CHECK IF ROW OR COLUMN MAJOR
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  if(idx < N * N) {
    int i = idx / N;
    int j = idx % N;

    next_d[idx] = (cur_d[((i - 1) * N) + j] * D_WEIGHT[0][1]
      + cur_d[((i + 1) * N) + j] * D_WEIGHT[2][1]
      + cur_d[((i * N) + j - 1)] * D_WEIGHT[1][0]
      + cur_d[((i * N) + j + 1)] * D_WEIGHT[1][2]
      + cur_d[idx] * D_WEIGHT[1][1] ) / 5;

    //cur_d[idx] = next_d[idx];
  }
}


////////////////////////////////////////////////////////////////
//     Main program
////////////////////////////////////////////////////////////////
int main(int argc, char**argv) {
  char* output_file="result.dat";
  int dump=1;
  int blocksize = 1024;
  dim3 block(blocksize);
  dim3 grid(ceil(N*N / (float)blocksize));

  struct timeval t1, t2;
  /* Initialize cpu and gpu mem buffers */
  init();

  /* Move cpu data to gpu buf */
  for (int i = 0; i < N; i++)
    gpuErrchk(cudaMemcpy(&cur_d[i*N], cur_step[i], sizeof(double) * N, cudaMemcpyHostToDevice));

  gettimeofday(&t1, NULL);
  for(int i=0; i< NB_STEPS; i++) {
    printf("STEP %d...\n", i);
    compute_gpu<<<grid, block>>>(cur_d, next_d, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(cur_d, next_d, sizeof(double) * N * N, cudaMemcpyDeviceToDevice));
  }
  gettimeofday(&t2, NULL);

  for (int i = 0; i < N; i++)
    gpuErrchk(cudaMemcpy(cur_step[i], &cur_d[i*N], sizeof(double) * N, cudaMemcpyDeviceToHost));

  double total_time = ((t2.tv_sec-t1.tv_sec)*1e6 + (t2.tv_usec - t1.tv_usec))/1e6;
  printf("%d steps in %lf sec (%lf sec/step)\n", NB_STEPS, total_time, total_time/NB_STEPS);

  if(dump){
    printf("dumping the result data in %s\n", output_file);
    FILE *f = fopen(output_file, "w");
    print_matrix(f, cur_step);
  } else {
    print_matrix(stdout, cur_step);
  }

  return EXIT_SUCCESS;
}

Writing convolution_gpu.cu


# Run GPU


In [9]:
!nvcc -arch=sm_75 convolution_gpu.cu -o convolution_gpu

    char* output_file="result.dat";
                      ^


    char* output_file="result.dat";
                      ^




In [10]:
! ./convolution_gpu

Generating 83 hot spots
86,77 : 1000000.000000
15,93 : 1000000.000000
35,86 : 1000000.000000
92,49 : 1000000.000000
21,62 : 1000000.000000
27,90 : 1000000.000000
59,63 : 1000000.000000
26,40 : 1000000.000000
26,72 : 1000000.000000
36,11 : 1000000.000000
68,67 : 1000000.000000
29,82 : 1000000.000000
30,62 : 1000000.000000
23,67 : 1000000.000000
35,29 : 1000000.000000
2,22 : 1000000.000000
58,69 : 1000000.000000
67,93 : 1000000.000000
56,11 : 1000000.000000
42,29 : 1000000.000000
73,21 : 1000000.000000
19,84 : 1000000.000000
37,98 : 1000000.000000
24,15 : 1000000.000000
70,13 : 1000000.000000
26,91 : 1000000.000000
80,56 : 1000000.000000
73,62 : 1000000.000000
70,96 : 1000000.000000
81,5 : 1000000.000000
25,84 : 1000000.000000
27,36 : 1000000.000000
5,46 : 1000000.000000
29,13 : 1000000.000000
57,24 : 1000000.000000
95,82 : 1000000.000000
45,14 : 1000000.000000
67,34 : 1000000.000000
64,43 : 1000000.000000
50,87 : 1000000.000000
8,76 : 1000000.000000
78,88 : 1000000.000000
84,3 : 1000000

In [11]:
! gnuplot -p plot.gp