# TP01 - Calcul matriciel avec Cuda

Installation et chargement des ressources nécessaires à la compilation et à l'exécution de code CUDA

In [1]:
!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Python 3.11.11
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpaw58aoas".


1er programme de découverte de Cuda

In [2]:
%%cuda
#include <stdio.h>
// Discovery kernel: every launched thread prints the x-index of its block
// and its own x-index inside that block.
__global__ void hello()
{
  const unsigned int block  = blockIdx.x;
  const unsigned int thread = threadIdx.x;

  printf("Hello from block: %u, thread: %u\n", block, thread);
}


// Launches the hello kernel on 4 blocks of 3 threads, prints "test" from the
// host, then waits for the device so the kernel's output is produced before
// the program exits.
//
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main(){
  hello<<<4, 3>>>();

  // A kernel launch returns no status: a rejected launch is only visible
  // through cudaGetLastError().
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "hello launch failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Printed by the host without waiting for the kernel to finish
  printf("test\n");

  // Wait for the kernel; errors raised while it ran are reported here
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "hello execution failed: %s\n", cudaGetErrorString(err));
    return 1;
  }

  return 0;
}

test
Hello from block: 2, thread: 0
Hello from block: 2, thread: 1
Hello from block: 2, thread: 2
Hello from block: 0, thread: 0
Hello from block: 0, thread: 1
Hello from block: 0, thread: 2
Hello from block: 3, thread: 0
Hello from block: 3, thread: 1
Hello from block: 3, thread: 2
Hello from block: 1, thread: 0
Hello from block: 1, thread: 1
Hello from block: 1, thread: 2



prac1c.cu - addition de vecteurs

In [3]:
%%cuda
//
// include files
//

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include "/content/drive/MyDrive/Cours./s8/CHPS802 - GPU/TP/TP01/Fichiers dentête-20250114/helper_cuda.h"


//
// kernel routine
//

// Element-wise vector addition: each thread computes
// res[i] = tab_A[i] + tab_B[i] for its own global 1D index i, then prints
// its index within the block together with its two inputs.
//
// There is no bounds check: the caller must not launch more threads than
// the three arrays have elements.
__global__ void my_first_kernel(float *tab_A, float *tab_B, float *res)
{
    // global 1D index of this thread across the whole grid
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;

    res[idx] = tab_A[idx] + tab_B[idx];

    // debugging output only
    printf("%d > %lf | %lf\n", threadIdx.x, tab_A[idx], tab_B[idx]);
}


//
// main code
//

// Host driver for the vector-addition kernel.
//
// Fills two host arrays, copies them to the device, runs my_first_kernel,
// copies the result back and prints it next to a reference sum computed on
// the host so both lines can be compared.
//
// Returns 0 on success, EXIT_FAILURE if a host allocation fails. CUDA
// failures are handled by checkCudaErrors / getLastCudaError.
int main(int argc, const char **argv)
{
  // number of blocks, and threads per block; the kernel has no bounds
  // check, so the arrays hold exactly one element per launched thread
  const int    nblocks  = 4;
  const int    nthreads = 8;
  const int    tab_size = nblocks*nthreads;
  const size_t nbytes   = tab_size*sizeof(float);

  float *h_tab_A, *h_tab_B, *h_res, *h_res_test;
  float *d_tab_A, *d_tab_B, *d_res;

  // initialise card

  findCudaDevice(argc, argv);

  // allocate memory for arrays

  h_tab_A    = (float*)malloc(nbytes);
  h_tab_B    = (float*)malloc(nbytes);
  h_res      = (float*)malloc(nbytes);
  h_res_test = (float*)malloc(nbytes);

  if (h_tab_A == NULL || h_tab_B == NULL || h_res == NULL || h_res_test == NULL) {
      fprintf(stderr, "host memory allocation failed\n");
      // free(NULL) is a no-op, so releasing all four is safe
      free(h_tab_A);
      free(h_tab_B);
      free(h_res);
      free(h_res_test);
      return EXIT_FAILURE;
  }

  checkCudaErrors(cudaMalloc((void**)&d_tab_A, nbytes));
  checkCudaErrors(cudaMalloc((void**)&d_tab_B, nbytes));
  checkCudaErrors(cudaMalloc((void**)&d_res, nbytes));


  // init tab

  for(int i=0; i<tab_size; i++){
      h_tab_A[i] = i+1;
      h_tab_B[i] = i+5;
  }


  // copy data host 2 device

  checkCudaErrors(cudaMemcpy(d_tab_A, h_tab_A, nbytes, cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_tab_B, h_tab_B, nbytes, cudaMemcpyHostToDevice));


  // execute kernel

  my_first_kernel<<<nblocks,nthreads>>>(d_tab_A, d_tab_B, d_res);
  getLastCudaError("my_first_kernel execution failed\n");

  // synchronize host & device before reading the result, so that an error
  // raised while the kernel ran is reported here and not by a later call

  checkCudaErrors(cudaDeviceSynchronize());


  // copy data device 2 host

  checkCudaErrors(cudaMemcpy(h_res, d_res, nbytes, cudaMemcpyDeviceToHost));


  // reference result computed on the host

  for(int i=0; i<tab_size; i++){
      h_res_test[i] = h_tab_A[i] + h_tab_B[i];
  }

  // print data: device result first, host reference second

  for(int i=0; i<tab_size; i++){
      printf("%lf ", h_res[i]);
  }
  printf("\n");

  for(int i=0; i<tab_size; i++){
      printf("%lf ", h_res_test[i]);
  }
  printf("\n");


  // free memory

  checkCudaErrors(cudaFree(d_tab_A));
  checkCudaErrors(cudaFree(d_tab_B));
  checkCudaErrors(cudaFree(d_res));
  free(h_tab_A);
  free(h_tab_B);
  free(h_res);
  free(h_res_test);


  // CUDA exit -- needed to flush printf write buffer

  cudaDeviceReset();

  return 0;
}


0 > 17.000000 | 21.000000
1 > 18.000000 | 22.000000
2 > 19.000000 | 23.000000
3 > 20.000000 | 24.000000
4 > 21.000000 | 25.000000
5 > 22.000000 | 26.000000
6 > 23.000000 | 27.000000
7 > 24.000000 | 28.000000
0 > 25.000000 | 29.000000
1 > 26.000000 | 30.000000
2 > 27.000000 | 31.000000
3 > 28.000000 | 32.000000
4 > 29.000000 | 33.000000
5 > 30.000000 | 34.000000
6 > 31.000000 | 35.000000
7 > 32.000000 | 36.000000
0 > 9.000000 | 13.000000
1 > 10.000000 | 14.000000
2 > 11.000000 | 15.000000
3 > 12.000000 | 16.000000
4 > 13.000000 | 17.000000
5 > 14.000000 | 18.000000
6 > 15.000000 | 19.000000
7 > 16.000000 | 20.000000
0 > 1.000000 | 5.000000
1 > 2.000000 | 6.000000
2 > 3.000000 | 7.000000
3 > 4.000000 | 8.000000
4 > 5.000000 | 9.000000
5 > 6.000000 | 10.000000
6 > 7.000000 | 11.000000
7 > 8.000000 | 12.000000
6.000000 8.000000 10.000000 12.000000 14.000000 16.000000 18.000000 20.000000 22.000000 24.000000 26.000000 28.000000 30.000000 32.000000 34.000000 36.000000 38.000000 40.000000 42.0

prac1b.cu - gestion des erreurs <br>
Utilisation de `checkCudaErrors`

In [4]:
%%cuda
//
// include files
//

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include "/content/drive/MyDrive/Cours./s8/CHPS802 - GPU/TP/TP01/Fichiers dentête-20250114/helper_cuda.h"


//
// kernel routine
//

// Each thread prints its global 1D index, then stores its index within the
// block (converted to float) at that global position of x.
//
// There is no bounds check: x must hold at least one element per launched
// thread.
__global__ void my_first_kernel(float *x)
{
  const int gid = blockIdx.x*blockDim.x + threadIdx.x;

  // debugging output only
  printf("thread : %d\n", gid);

  x[gid] = static_cast<float>(threadIdx.x);
}


//
// main code
//

// Host driver with error checking: allocates one float per launched thread,
// runs my_first_kernel, copies the array back and prints every element.
// CUDA calls go through checkCudaErrors and the launch through
// getLastCudaError.
int main(int argc, const char **argv)
{
  // initialise card

  findCudaDevice(argc, argv);

  // number of blocks, threads per block, and total element count

  const int nblocks  = 2;
  const int nthreads = 8;
  const int nsize    = nblocks*nthreads;

  // host and device buffers, one float per thread

  float *h_x = (float *)malloc(nsize*sizeof(float));
  float *d_x;
  checkCudaErrors(cudaMalloc((void **)&d_x, nsize*sizeof(float)));

  // run the kernel and check that the launch was accepted

  my_first_kernel<<<nblocks,nthreads>>>(d_x);
  getLastCudaError("my_first_kernel execution failed\n");

  // bring the results back to the host and print them

  checkCudaErrors( cudaMemcpy(h_x,d_x,nsize*sizeof(float), cudaMemcpyDeviceToHost) );

  for (int n = 0; n < nsize; ++n) {
    printf(" n,  x  =  %d  %f \n", n, h_x[n]);
  }

  // release device then host memory

  checkCudaErrors(cudaFree(d_x));
  free(h_x);

  // CUDA exit -- needed to flush printf write buffer

  cudaDeviceReset();

  return 0;
}


thread : 8
thread : 9
thread : 10
thread : 11
thread : 12
thread : 13
thread : 14
thread : 15
thread : 0
thread : 1
thread : 2
thread : 3
thread : 4
thread : 5
thread : 6
thread : 7
 n,  x  =  0  0.000000 
 n,  x  =  1  1.000000 
 n,  x  =  2  2.000000 
 n,  x  =  3  3.000000 
 n,  x  =  4  4.000000 
 n,  x  =  5  5.000000 
 n,  x  =  6  6.000000 
 n,  x  =  7  7.000000 
 n,  x  =  8  0.000000 
 n,  x  =  9  1.000000 
 n,  x  =  10  2.000000 
 n,  x  =  11  3.000000 
 n,  x  =  12  4.000000 
 n,  x  =  13  5.000000 
 n,  x  =  14  6.000000 
 n,  x  =  15  7.000000 



prac1a.cu - pas de gestion d'erreur

In [5]:
%%cuda
//
// include files
//

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

//
// kernel routine
//

// Each thread stores its index within the block (converted to float) at its
// global 1D position in x.
//
// There is no bounds check: x must hold at least one element per launched
// thread.
__global__ void my_first_kernel(float *x)
{
  const int gid = blockIdx.x*blockDim.x + threadIdx.x;

  x[gid] = static_cast<float>(threadIdx.x);
}


//
// main code
//

// Host driver WITHOUT any error handling: no CUDA return value is inspected
// and the kernel launch is never checked, so a failure at any step goes
// unreported.
int main(int argc, char **argv)
{
  // number of blocks, threads per block, and total element count
  // NOTE(review): 0 blocks of 10000 threads looks like a deliberately
  // invalid launch configuration for this "no error handling" exercise --
  // confirm. With nblocks = 0, nsize is 0 and the print loop below emits
  // nothing.

  const int nblocks  = 0;
  const int nthreads = 10000;
  const int nsize    = nblocks*nthreads;

  const size_t nbytes = nsize*sizeof(float);

  // host and device buffers

  float *h_x = (float *)malloc(nbytes);
  float *d_x;
  cudaMalloc((void **)&d_x, nbytes);

  // run the kernel (launch status intentionally not checked)

  my_first_kernel<<<nblocks,nthreads>>>(d_x);

  // bring the results back to the host and print them

  cudaMemcpy(h_x, d_x, nbytes, cudaMemcpyDeviceToHost);

  for (int n = 0; n < nsize; ++n) {
    printf(" n,  x  =  %d  %f \n", n, h_x[n]);
  }

  // release device then host memory

  cudaFree(d_x);
  free(h_x);

  // CUDA exit -- needed to flush printf write buffer

  cudaDeviceReset();

  return 0;
}





### Test unitaire avec Google Test

Installation de Google Test

In [6]:
!git clone https://github.com/google/googletest.git
!cd googletest && mkdir build && cd build && cmake .. && make -j$(nproc)

Cloning into 'googletest'...
remote: Enumerating objects: 27769, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 27769 (delta 40), reused 19 (delta 19), pack-reused 27692 (from 4)[K
Receiving objects: 100% (27769/27769), 13.33 MiB | 16.65 MiB/s, done.
Resolving deltas: 100% (20594/20594), done.
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- 

Test unitaire sur la somme de vecteurs
 - Groupe : CudaTest
 - Nom : SommeVecteur
<br>
ASSERT : arrête le test dès le 1er échec <br>
EXPECT : signale l'échec mais poursuit le test

In [7]:
#on crée un fichier qui peut être compilé
%%writefile test.cu

#include <gtest/gtest.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Vector-addition kernel under test: each thread handles its own global 1D
// index. The extra "+ 1" is a deliberate fault that makes the sum wrong.
// There is no bounds check: the arrays must hold one element per thread.
__global__ void my_first_kernel(float *tab_A, float *tab_B, float *res) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const float sum = tab_A[idx] + tab_B[idx];
    res[idx] = sum + 1; // "+1" deliberately skews the sum
}

// Unit test CudaTest.SommeVecteur.
//
// Runs my_first_kernel on two known input vectors and checks every element
// of the device result against the sum computed on the host. ASSERT_* is
// used, so the check stops at the first failing element.
//
// All buffers are released on every path, including after a failed
// assertion.
TEST(CudaTest, SommeVecteur) {
    // The kernel has no bounds check: one array element per launched thread
    const int nblocks = 4;
    const int nthreads = 8;
    const int tab_size = nblocks * nthreads;
    const size_t nbytes = tab_size * sizeof(float);

    // Start from null so the cleanup below is valid even if a step fails
    // before every buffer has been allocated
    float *h_tab_A = nullptr, *h_tab_B = nullptr, *h_res = nullptr;
    float *d_tab_A = nullptr, *d_tab_B = nullptr, *d_res = nullptr;

    // A fatal ASSERT_* returns from the enclosing function. Running the
    // steps inside a lambda confines that early return to the lambda, so
    // the cleanup after the call is always reached.
    auto runAndVerify = [&]() -> void {
        h_tab_A = (float*)malloc(nbytes);
        h_tab_B = (float*)malloc(nbytes);
        h_res = (float*)malloc(nbytes);
        ASSERT_TRUE(h_tab_A != nullptr);
        ASSERT_TRUE(h_tab_B != nullptr);
        ASSERT_TRUE(h_res != nullptr);

        ASSERT_EQ(cudaMalloc((void**)&d_tab_A, nbytes), cudaSuccess);
        ASSERT_EQ(cudaMalloc((void**)&d_tab_B, nbytes), cudaSuccess);
        ASSERT_EQ(cudaMalloc((void**)&d_res, nbytes), cudaSuccess);

        for (int i = 0; i < tab_size; i++) {
            h_tab_A[i] = i + 1;
            h_tab_B[i] = i + 5;
        }

        ASSERT_EQ(cudaMemcpy(d_tab_A, h_tab_A, nbytes, cudaMemcpyHostToDevice), cudaSuccess);
        ASSERT_EQ(cudaMemcpy(d_tab_B, h_tab_B, nbytes, cudaMemcpyHostToDevice), cudaSuccess);

        my_first_kernel<<<nblocks, nthreads>>>(d_tab_A, d_tab_B, d_res);
        // A launch returns no status: check it explicitly, then wait for
        // the kernel so execution errors are reported too
        ASSERT_EQ(cudaGetLastError(), cudaSuccess);
        ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);

        ASSERT_EQ(cudaMemcpy(h_res, d_res, nbytes, cudaMemcpyDeviceToHost), cudaSuccess);

        for (int i = 0; i < tab_size; i++) {
            //EXPECT_EQ(h_res[i], h_tab_A[i] + h_tab_B[i]);
            ASSERT_EQ(h_res[i], h_tab_A[i] + h_tab_B[i]);
        }
    };
    runAndVerify();

    // cudaFree and free both accept a null pointer
    cudaFree(d_tab_A);
    cudaFree(d_tab_B);
    cudaFree(d_res);
    free(h_tab_A);
    free(h_tab_B);
    free(h_res);
}

// Test-runner entry point: passes the command line to Google Test, runs the
// tests and returns the value of RUN_ALL_TESTS() as the exit code.
int main(int argc, char **argv) {
    ::testing::InitGoogleTest(&argc, argv);

    const int status = RUN_ALL_TESTS();
    return status;
}

Writing test.cu


Compilation avec `nvcc` et les flags de Google Test

In [8]:
!nvcc -Igoogletest/googletest/include -Lgoogletest/build/lib -lgtest -lgtest_main test.cu -o test_unitaire

Execution du test unitaire

In [9]:
!./test_unitaire

[0;32m[----------] [mGlobal test environment set-up.
[0;32m[----------] [m1 test from CudaTest
[0;32m[ RUN      ] [mCudaTest.SommeVecteur
test.cu:40: Failure
Expected equality of these values:
  h_res[i]
    Which is: 7
  h_tab_A[i] + h_tab_B[i]
    Which is: 6

[0;31m[  FAILED  ] [mCudaTest.SommeVecteur (159 ms)
[0;32m[----------] [m1 test from CudaTest (159 ms total)

[0;32m[----------] [mGlobal test environment tear-down
[0;32m[  PASSED  ] [m0 tests.
[0;31m[  FAILED  ] [m1 test, listed below:
[0;31m[  FAILED  ] [mCudaTest.SommeVecteur

 1 FAILED TEST


Ici, on remarque que le test unitaire a échoué car j'ai ajouté dans le kernel un `+1` qui fausse la somme. De plus, l'utilisation de `ASSERT` arrête le test dès la 1ère erreur.