## Error management utils

In [127]:
%%writefile cuda_stuff.cuh
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

#ifndef cuda_stuff_H
#define cuda_stuff_H

//MACRO TO DEBUG CUDA FUNCTIONS
/** Error checking,
 *  taken from https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
 */
// Wrap any expression yielding a cudaError_t: gpuErrchk(cudaMalloc(...));
// The call site's file name and line number are forwarded to gpuAssert.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**
 * Report a CUDA runtime error and optionally terminate the program.
 *
 * @param code   status value to inspect
 * @param file   source file name of the call site (printed in the message)
 * @param line   source line of the call site (printed in the message)
 * @param abort  when true (the default), the process exits with `code`
 *               as its exit status after the message has been printed
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    // Nothing to report on success.
    if (code == cudaSuccess)
        return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

    // Caller asked to keep running despite the error.
    if (!abort)
        return;

    exit(code);
}

#endif

Overwriting cuda_stuff.cuh


In [128]:
%%writefile saxpy.cu
/*
 * GPU code of SAXPY
 * Y = a.X + Y
 */

#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <math.h>

#include "cuda_stuff.cuh"

////////////////////////////////////////////////////////////////
//     Vector initialization
////////////////////////////////////////////////////////////////
/**
 * Fill the first `len` entries of `tab` with `val`.
 *
 * @param tab  destination array (host memory), must hold at least `len` floats
 * @param len  number of entries to write; nothing is written when len <= 0
 * @param val  value stored in every entry
 */
void init_tab(float *tab, int len, float val) {
    float *cursor = tab;
    int remaining = len;

    // Count down instead of indexing; a non-positive len skips the loop.
    while (remaining-- > 0) {
        *cursor++ = val;
    }
}

/**
 * Print the head and the tail of a host array to stdout.
 *
 * Up to 10 leading and up to 10 trailing elements are printed with two
 * decimals. When the array holds fewer than 10 elements, both ranges are
 * clamped to [0, len) so that no element outside the array is ever read
 * (head and tail then show the same elements). For len >= 10 the output
 * is exactly 10 + 10 values.
 *
 * @param tab_name  label inserted in the header line
 * @param tab       array to display (host memory)
 * @param len       number of valid elements in `tab`; len <= 0 prints
 *                  only the two header lines
 */
void print_tab(const char *tab_name, float *tab, int len){
   const int shown = 10;
   /* End of the head range: min(len, shown), never negative. */
   const int head_end = (len < shown) ? (len > 0 ? len : 0) : shown;
   /* Start of the tail range: max(len - shown, 0). */
   const int tail_begin = (len > shown) ? (len - shown) : 0;
   int k;

   printf("\n 10 first elements of %s: \n", tab_name);
   for (k=0; k<head_end; k++)
      printf("%.2f ",tab[k]);
   printf("\n 10 lasts : \n");
   for (k=tail_begin; k<len; k++)
      printf("%.2f ",tab[k]);
   printf("\n");
}



////////////////////////////////////////////////////////////////
//     SAXPY kernel
////////////////////////////////////////////////////////////////
/**
 * SAXPY kernel: tabY[i] = a * tabX[i] + tabY[i] for every i in [0, len).
 *
 * Each thread handles at most one element, selected from its 1D block and
 * thread indices; threads whose index is >= len do nothing. The launch must
 * therefore provide at least `len` threads along x to cover the whole array.
 *
 * @param tabX  input vector X (device pointer)
 * @param tabY  input/output vector Y (device pointer), updated in place
 * @param len   number of elements in both vectors
 * @param a     scalar multiplier applied to X
 */
__global__ void saxpy(float *tabX, float *tabY, int len, float a){
   /* Flat global index of this thread in the 1D launch */
   const int gid = threadIdx.x + blockDim.x * blockIdx.x;

   /* Surplus threads in the last block have no element to process */
   if (gid >= len)
     return;

   const float x = tabX[gid];
   const float y = tabY[gid];
   tabY[gid] = a * x + y;
}



////////////////////////////////////////////////////////////////
//     Main program
////////////////////////////////////////////////////////////////
/**
 * Benchmark driver: runs the SAXPY kernel (Y = 2*X + Y with X = 5, Y = 1)
 * on vectors of increasing length and prints, for each length, the head and
 * tail of Y before and after the computation plus the kernel time measured
 * with CUDA events.
 *
 * Every CUDA runtime call is checked through gpuErrchk, host allocations are
 * checked explicitly, and the timing events are created once and destroyed
 * before returning.
 *
 * @return EXIT_SUCCESS when all sizes ran; EXIT_FAILURE when a host
 *         allocation fails (CUDA errors terminate through gpuErrchk).
 */
int main(int argc, char** argv){
    (void) argc;
    (void) argv;

    float *tabX_d, *tabX_h;
    float *tabY_d, *tabY_h;
    const int lens[4] = {1000, 10000, 100000, 1000000000};
    const int num_lens = (int) (sizeof(lens) / sizeof(lens[0]));

    /** Initialization of the grid **/
    const int blocksize = 1024;
    dim3 block(blocksize);

    /* Timing events are reused across iterations instead of being
       re-created (and leaked) for every vector length. */
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    for (int i = 0; i < num_lens; i++) {
      float milliseconds = 0;

      const int len = lens[i];
      const size_t bytes = sizeof(float) * (size_t) len;
      /* Ceil-division done in size_t so that len + blocksize - 1 cannot
         overflow int for lengths close to INT_MAX. */
      dim3 grid((unsigned int) (((size_t) len + blocksize - 1) / blocksize));

      /** Allocation in host memory **/
      tabX_h = (float *) malloc(bytes);
      tabY_h = (float *) malloc(bytes);
      if (tabX_h == NULL || tabY_h == NULL) {
        /* The largest case needs several GB per array: fail cleanly
           instead of dereferencing a null pointer in init_tab. */
        fprintf(stderr, "Host allocation of %zu bytes failed (len=%d)\n", bytes, len);
        free(tabX_h);
        free(tabY_h);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return EXIT_FAILURE;
      }
      init_tab(tabX_h, len, 5.f);
      init_tab(tabY_h, len, 1.f);

      /** Allocation in device memory **/
      gpuErrchk(cudaMalloc((void**) &tabX_d, bytes));
      gpuErrchk(cudaMalloc((void**) &tabY_d, bytes));

      /** Pre-print of tabY **/
      printf("Before computation \n");
      print_tab("tabY_h",tabY_h, len);

      /** Transfer of data from host to device **/
      gpuErrchk(cudaMemcpy(tabX_d, tabX_h, bytes, cudaMemcpyHostToDevice));
      gpuErrchk(cudaMemcpy(tabY_d, tabY_h, bytes, cudaMemcpyHostToDevice));

      /** SAXPY kernel launching **/
      gpuErrchk(cudaEventRecord(start));
      saxpy<<<grid, block>>>(tabX_d, tabY_d, len, 2.f);
      /* Launch-configuration errors are only visible through the
         last-error state, so query it right after the launch. */
      gpuErrchk(cudaGetLastError());
      gpuErrchk(cudaEventRecord(stop));
      /* Waiting on `stop` also waits for the kernel recorded before it,
         so errors raised during execution surface here and no separate
         cudaDeviceSynchronize is needed before reading the timing. */
      gpuErrchk(cudaEventSynchronize(stop));

      gpuErrchk(cudaEventElapsedTime(&milliseconds, start, stop));

      /** Transfer of the result from device to host **/
      gpuErrchk(cudaMemcpy(tabY_h, tabY_d, bytes, cudaMemcpyDeviceToHost));

      /** Display of the result **/
      printf("\nAfter computation\n");
      print_tab("tabY_h", tabY_h, len);

      printf("\nTime elapsed (len=%d): %f ms\n\n", len, milliseconds);

      /** Memory free **/
      gpuErrchk(cudaFree(tabX_d));
      gpuErrchk(cudaFree(tabY_d));
      free(tabX_h);
      free(tabY_h);
    }

    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));

    return EXIT_SUCCESS;
}

Overwriting saxpy.cu


In [129]:
! nvcc -arch=sm_75 saxpy.cu -o saxpy

In [130]:
! ./saxpy

Before computation 

 10 first elements of tabY_h: 
1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 
 10 lasts : 
1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 

After computation

 10 first elements of tabY_h: 
11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 
 10 lasts : 
11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 

Time elapsed (len=1000): 0.092192 ms

Before computation 

 10 first elements of tabY_h: 
1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 
 10 lasts : 
1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 

After computation

 10 first elements of tabY_h: 
11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 
 10 lasts : 
11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 

Time elapsed (len=10000): 0.012288 ms

Before computation 

 10 first elements of tabY_h: 
1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 
 10 lasts : 
1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 

After computation

 10 first elements of tabY_h