# TP02

## Installations

Installation de nvcc

In [1]:
!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Python 3.11.11
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp5ig16_kw".


Installation de Google Test

In [2]:
!git clone https://github.com/google/googletest.git
!cd googletest && mkdir build && cd build && cmake .. && make -j$(nproc)

Cloning into 'googletest'...
remote: Enumerating objects: 27786, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 27786 (delta 51), reused 29 (delta 25), pack-reused 27695 (from 3)[K
Receiving objects: 100% (27786/27786), 13.30 MiB | 6.50 MiB/s, done.
Resolving deltas: 100% (20612/20612), done.
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- C

In [3]:
# Mount Google Drive: the CUDA sources written below include helper_cuda.h
# through a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


-------------------

## Makefile

In [84]:
%%writefile Makefile

# Default CUDA install prefix. Without it $(CUDA_HOME) expands to nothing and the
# flags below become -I/include and -L/lib64. A value from the environment or
# the make command line still takes precedence.
CUDA_HOME	?= /usr/local/cuda

INC	:= -I$(CUDA_HOME)/include -I. -I../headers
LIB	:= -L$(CUDA_HOME)/lib64 -lcudart -lcurand
GOOGLETEST := -Igoogletest/googletest/include -Lgoogletest/build/lib -lgtest -lgtest_main

NVCCFLAGS	:= -lineinfo -arch=sm_70 --ptxas-options=-v --use_fast_math

# all and clean name commands, not files.
.PHONY:	all clean

all:	prac2_1 prac2_2 prac2_device prac2_average prac2_average_test


# Each binary is rebuilt when its source or this Makefile changes.
prac2_1:	prac2_1.cu Makefile
	nvcc prac2_1.cu -o prac2_1 $(INC) $(NVCCFLAGS) $(LIB)

prac2_2:	prac2_2.cu Makefile
	nvcc prac2_2.cu -o prac2_2 $(INC) $(NVCCFLAGS) $(LIB)

prac2_device:	prac2_device.cu Makefile
	nvcc prac2_device.cu -o prac2_device $(INC) $(NVCCFLAGS) $(LIB)

prac2_average:	prac2_average.cu Makefile
	nvcc prac2_average.cu -o prac2_average $(INC) $(NVCCFLAGS) $(LIB)

# The unit-test binary additionally needs the Google Test headers and libraries.
prac2_average_test:	prac2_average_test.cu Makefile
	nvcc $(GOOGLETEST) prac2_average_test.cu -o prac2_average_test $(INC) $(NVCCFLAGS) $(LIB)

clean:
	rm -f prac2_1 prac2_2 prac2_device prac2_average prac2_average_test


Overwriting Makefile


## Programmes

### Code 1
Découpage d'un tableau en mémoire cache <br>


#### Version 1
`prac2_1.cu`

In [86]:
%%writefile prac2_1.cu


//#include <gtest/gtest.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include <cuda.h>
#include <curand.h>
#include "/content/drive/MyDrive/Cours./s8/CHPS802 - GPU/TP/header/helper_cuda.h"



////////////////////////////////////////////////////////////////////////
// CUDA global constants
////////////////////////////////////////////////////////////////////////

__constant__ int   N;
__constant__ float T, r, sigma, rho, alpha, dt, con1, con2;


////////////////////////////////////////////////////////////////////////
// kernel routine
////////////////////////////////////////////////////////////////////////


/*
 * Path kernel, "version 1" sample layout.
 *
 * Each thread runs N steps. At every step it reads two samples from d_z,
 * forms y1 and y2 = rho*y1 + alpha*z, and multiplies s1 and s2 by
 * (con1 + con2*y). The payoff written to d_v is exp(-r*T) when both s1 and
 * s2 end within 0.1 of 1, otherwise 0.
 *
 * Layout: a block starts at offset 2*N*blockIdx.x*blockDim.x and successive
 * samples of one thread are blockDim.x apart, so for a given step the threads
 * of a block read adjacent elements of d_z.
 *
 *   d_z  device array of input samples (read only here)
 *   d_v  device array receiving one payoff per thread
 */
__global__ void pathcalc(float *d_z, float *d_v)
{
  const int stride = blockDim.x;

  // first sample of this thread inside its block's region
  int pos = threadIdx.x + 2*N*blockIdx.x*blockDim.x;

  float s1 = 1.0f;
  float s2 = 1.0f;

  for (int step = 0; step < N; step++) {
    const float y1 = d_z[pos];
    const float y2 = rho*y1 + alpha*d_z[pos + stride];
    pos += 2*stride;          // two samples consumed per step

    s1 = s1*(con1 + con2*y1);
    s2 = s2*(con1 + con2*y2);
  }

  // payoff is non-zero only when both end values stay in the band
  const bool  in_band = fabs(s1-1.0f)<0.1f && fabs(s2-1.0f)<0.1f;
  const float payoff  = in_band ? exp(-r*T) : 0.0f;

  d_v[threadIdx.x + blockIdx.x*blockDim.x] = payoff;
}


////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////

/*
 * Host driver: fills d_z with 2*h_N*NPATH normal samples using cuRAND, runs
 * the pathcalc kernel of this file (one thread per path), copies the payoffs
 * back, and prints the timings, the payoff average with its standard error,
 * and the effective memory bandwidth of the kernel.
 *
 * Returns 0 on success and EXIT_FAILURE when the host buffer cannot be
 * allocated. Checked CUDA/cuRAND failures go through checkCudaErrors.
 */
int main(int argc, const char **argv){

  const int BLOCK_SIZE = 128;        // threads per block for pathcalc
  int     NPATH=9600000, h_N=100;    // number of paths, time steps per path
  float   h_T, h_r, h_sigma, h_rho, h_alpha, h_dt, h_con1, h_con2;
  float  *h_v, *d_v, *d_z;
  double  sum1, sum2;

  // initialise card

  findCudaDevice(argc, argv);

  // initialise CUDA timing

  float milli;
  cudaEvent_t start, stop;
  checkCudaErrors( cudaEventCreate(&start) );
  checkCudaErrors( cudaEventCreate(&stop) );

  // allocate memory on host and device

  h_v = (float *)malloc(sizeof(float)*NPATH);
  if (h_v == NULL) {
    fprintf(stderr, "host allocation of %d payoffs failed\n", NPATH);
    return EXIT_FAILURE;
  }

  checkCudaErrors( cudaMalloc((void **)&d_v, sizeof(float)*NPATH) );
  checkCudaErrors( cudaMalloc((void **)&d_z, sizeof(float)*2*h_N*NPATH) );

  // define constants and transfer to GPU

  h_T     = 1.0f;
  h_r     = 0.05f;
  h_sigma = 0.1f;
  h_rho   = 0.5f;
  h_alpha = sqrt(1.0f-h_rho*h_rho);
  h_dt    = 1.0f/h_N;
  h_con1  = 1.0f + h_r*h_dt;
  h_con2  = sqrt(h_dt)*h_sigma;

  checkCudaErrors( cudaMemcpyToSymbol(N,    &h_N,    sizeof(h_N)) );
  checkCudaErrors( cudaMemcpyToSymbol(T,    &h_T,    sizeof(h_T)) );
  checkCudaErrors( cudaMemcpyToSymbol(r,    &h_r,    sizeof(h_r)) );
  checkCudaErrors( cudaMemcpyToSymbol(sigma,&h_sigma,sizeof(h_sigma)) );
  checkCudaErrors( cudaMemcpyToSymbol(rho,  &h_rho,  sizeof(h_rho)) );
  checkCudaErrors( cudaMemcpyToSymbol(alpha,&h_alpha,sizeof(h_alpha)) );
  checkCudaErrors( cudaMemcpyToSymbol(dt,   &h_dt,   sizeof(h_dt)) );
  checkCudaErrors( cudaMemcpyToSymbol(con1, &h_con1, sizeof(h_con1)) );
  checkCudaErrors( cudaMemcpyToSymbol(con2, &h_con2, sizeof(h_con2)) );

  // random number generation (timed with the start/stop events)

  curandGenerator_t gen;
  checkCudaErrors( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
  checkCudaErrors( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );

  cudaEventRecord(start);
  checkCudaErrors( curandGenerateNormal(gen, d_z, 2*h_N*NPATH, 0.0f, 1.0f) );
  cudaEventRecord(stop);

  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&milli, start, stop);

  printf("CURAND normal RNG  execution time (ms): %f,  samples/sec: %e \n", milli, 2.0*h_N*NPATH/(0.001*milli));

  // execute kernel and time it
  // NPATH is a multiple of BLOCK_SIZE here; the kernel has no bounds check,
  // so the grid must cover exactly NPATH threads.

  cudaEventRecord(start);
  pathcalc<<<NPATH/BLOCK_SIZE, BLOCK_SIZE>>>(d_z, d_v);
  cudaEventRecord(stop);

  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&milli, start, stop);   // milli now holds the kernel time

  getLastCudaError("pathcalc execution failed\n");
  printf("Monte Carlo kernel execution time (ms): %f \n",milli);

  // copy back results

  checkCudaErrors( cudaMemcpy(h_v, d_v, sizeof(float)*NPATH,
                   cudaMemcpyDeviceToHost) );

  // compute average

  sum1 = 0.0;
  sum2 = 0.0;
  for (int i=0; i<NPATH; i++) {
    sum1 += h_v[i];
    sum2 += h_v[i]*h_v[i];
  }

  printf("\nAverage value and standard deviation of error  = %13.8f %13.8f\n\n",
	 sum1/NPATH, sqrt((sum2/NPATH - (sum1/NPATH)*(sum1/NPATH))/NPATH) );

  // Tidy up library

  checkCudaErrors( curandDestroyGenerator(gen) );

  // Release memory

  free(h_v);
  checkCudaErrors( cudaFree(d_v) );
  checkCudaErrors( cudaFree(d_z) );

  // Volume of data moved by the kernel: every sample is read once and one
  // payoff is written per path.
  double data_read = 2.0 * h_N * NPATH * sizeof(float) / 1e9; // in GB
  double data_written = NPATH * sizeof(float) / 1e9; // in GB
  double total_data = data_read + data_written;

  // Effective transfer rate, based on the kernel time measured above
  double execution_time = milli / 1000.0; // milliseconds to seconds
  double bandwidth = total_data / execution_time; // GB/s

  printf("Data read: %f GB\n", data_read);
  printf("Data written: %f GB\n", data_written);
  printf("Total data transferred: %f GB\n", total_data);
  printf("Effective memory bandwidth: %f GB/s\n", bandwidth); // original author note: about 40 GB/s "with the charger"; hardware dependent

  // Release the timing events, which were previously never destroyed

  checkCudaErrors( cudaEventDestroy(start) );
  checkCudaErrors( cudaEventDestroy(stop) );

  // CUDA exit -- needed to flush printf write buffer

  cudaDeviceReset();

  return 0;
}




Writing prac2_1.cu


#### Version 2
`prac2_2.cu`

In [85]:
%%writefile prac2_2.cu


//#include <gtest/gtest.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include <cuda.h>
#include <curand.h>
#include "/content/drive/MyDrive/Cours./s8/CHPS802 - GPU/TP/header/helper_cuda.h"



////////////////////////////////////////////////////////////////////////
// CUDA global constants
////////////////////////////////////////////////////////////////////////

__constant__ int   N;
__constant__ float T, r, sigma, rho, alpha, dt, con1, con2;


////////////////////////////////////////////////////////////////////////
// kernel routine
////////////////////////////////////////////////////////////////////////


/*
 * Path kernel, "version 2" sample layout.
 *
 * Each thread runs N steps. At every step it reads two samples from d_z,
 * forms y1 and y2 = rho*y1 + alpha*z, and multiplies s1 and s2 by
 * (con1 + con2*y). The payoff written to d_v is exp(-r*T) when both s1 and
 * s2 end within 0.1 of 1, otherwise 0.
 *
 * Layout: every thread owns 2*N consecutive elements of d_z, starting at
 * 2*N*(threadIdx.x + blockIdx.x*blockDim.x), and walks them one by one.
 * Neighbouring threads therefore read elements that are 2*N apart.
 *
 *   d_z  device array of input samples (read only here)
 *   d_v  device array receiving one payoff per thread
 */
__global__ void pathcalc(float *d_z, float *d_v)
{
  // first sample of this thread's private segment
  int pos = 2*N*threadIdx.x + 2*N*blockIdx.x*blockDim.x;

  float s1 = 1.0f;
  float s2 = 1.0f;

  for (int step = 0; step < N; step++) {
    const float y1 = d_z[pos];
    const float y2 = rho*y1 + alpha*d_z[pos + 1];
    pos += 2;                 // two samples consumed per step

    s1 = s1*(con1 + con2*y1);
    s2 = s2*(con1 + con2*y2);
  }

  // payoff is non-zero only when both end values stay in the band
  const bool  in_band = fabs(s1-1.0f)<0.1f && fabs(s2-1.0f)<0.1f;
  const float payoff  = in_band ? exp(-r*T) : 0.0f;

  d_v[threadIdx.x + blockIdx.x*blockDim.x] = payoff;
}


////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////

/*
 * Host driver: fills d_z with 2*h_N*NPATH normal samples using cuRAND, runs
 * the pathcalc kernel of this file (one thread per path), copies the payoffs
 * back, and prints the timings, the payoff average with its standard error,
 * and the effective memory bandwidth of the kernel.
 *
 * Returns 0 on success and EXIT_FAILURE when the host buffer cannot be
 * allocated. Checked CUDA/cuRAND failures go through checkCudaErrors.
 */
int main(int argc, const char **argv){

  const int BLOCK_SIZE = 128;        // threads per block for pathcalc
  int     NPATH=9600000, h_N=100;    // number of paths, time steps per path
  float   h_T, h_r, h_sigma, h_rho, h_alpha, h_dt, h_con1, h_con2;
  float  *h_v, *d_v, *d_z;
  double  sum1, sum2;

  // initialise card

  findCudaDevice(argc, argv);

  // initialise CUDA timing

  float milli;
  cudaEvent_t start, stop;
  checkCudaErrors( cudaEventCreate(&start) );
  checkCudaErrors( cudaEventCreate(&stop) );

  // allocate memory on host and device

  h_v = (float *)malloc(sizeof(float)*NPATH);
  if (h_v == NULL) {
    fprintf(stderr, "host allocation of %d payoffs failed\n", NPATH);
    return EXIT_FAILURE;
  }

  checkCudaErrors( cudaMalloc((void **)&d_v, sizeof(float)*NPATH) );
  checkCudaErrors( cudaMalloc((void **)&d_z, sizeof(float)*2*h_N*NPATH) );

  // define constants and transfer to GPU

  h_T     = 1.0f;
  h_r     = 0.05f;
  h_sigma = 0.1f;
  h_rho   = 0.5f;
  h_alpha = sqrt(1.0f-h_rho*h_rho);
  h_dt    = 1.0f/h_N;
  h_con1  = 1.0f + h_r*h_dt;
  h_con2  = sqrt(h_dt)*h_sigma;

  checkCudaErrors( cudaMemcpyToSymbol(N,    &h_N,    sizeof(h_N)) );
  checkCudaErrors( cudaMemcpyToSymbol(T,    &h_T,    sizeof(h_T)) );
  checkCudaErrors( cudaMemcpyToSymbol(r,    &h_r,    sizeof(h_r)) );
  checkCudaErrors( cudaMemcpyToSymbol(sigma,&h_sigma,sizeof(h_sigma)) );
  checkCudaErrors( cudaMemcpyToSymbol(rho,  &h_rho,  sizeof(h_rho)) );
  checkCudaErrors( cudaMemcpyToSymbol(alpha,&h_alpha,sizeof(h_alpha)) );
  checkCudaErrors( cudaMemcpyToSymbol(dt,   &h_dt,   sizeof(h_dt)) );
  checkCudaErrors( cudaMemcpyToSymbol(con1, &h_con1, sizeof(h_con1)) );
  checkCudaErrors( cudaMemcpyToSymbol(con2, &h_con2, sizeof(h_con2)) );

  // random number generation (timed with the start/stop events)

  curandGenerator_t gen;
  checkCudaErrors( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
  checkCudaErrors( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );

  cudaEventRecord(start);
  checkCudaErrors( curandGenerateNormal(gen, d_z, 2*h_N*NPATH, 0.0f, 1.0f) );
  cudaEventRecord(stop);

  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&milli, start, stop);

  printf("CURAND normal RNG  execution time (ms): %f,  samples/sec: %e \n", milli, 2.0*h_N*NPATH/(0.001*milli));

  // execute kernel and time it
  // NPATH is a multiple of BLOCK_SIZE here; the kernel has no bounds check,
  // so the grid must cover exactly NPATH threads.

  cudaEventRecord(start);
  pathcalc<<<NPATH/BLOCK_SIZE, BLOCK_SIZE>>>(d_z, d_v);
  cudaEventRecord(stop);

  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&milli, start, stop);   // milli now holds the kernel time

  getLastCudaError("pathcalc execution failed\n");
  printf("Monte Carlo kernel execution time (ms): %f \n",milli);

  // copy back results

  checkCudaErrors( cudaMemcpy(h_v, d_v, sizeof(float)*NPATH,
                   cudaMemcpyDeviceToHost) );

  // compute average

  sum1 = 0.0;
  sum2 = 0.0;
  for (int i=0; i<NPATH; i++) {
    sum1 += h_v[i];
    sum2 += h_v[i]*h_v[i];
  }

  printf("\nAverage value and standard deviation of error  = %13.8f %13.8f\n\n",
	 sum1/NPATH, sqrt((sum2/NPATH - (sum1/NPATH)*(sum1/NPATH))/NPATH) );

  // Tidy up library

  checkCudaErrors( curandDestroyGenerator(gen) );

  // Release memory

  free(h_v);
  checkCudaErrors( cudaFree(d_v) );
  checkCudaErrors( cudaFree(d_z) );

  // Volume of data moved by the kernel: every sample is read once and one
  // payoff is written per path.
  double data_read = 2.0 * h_N * NPATH * sizeof(float) / 1e9; // in GB
  double data_written = NPATH * sizeof(float) / 1e9; // in GB
  double total_data = data_read + data_written;

  // Effective transfer rate, based on the kernel time measured above
  double execution_time = milli / 1000.0; // milliseconds to seconds
  double bandwidth = total_data / execution_time; // GB/s

  printf("Data read: %f GB\n", data_read);
  printf("Data written: %f GB\n", data_written);
  printf("Total data transferred: %f GB\n", total_data);
  printf("Effective memory bandwidth: %f GB/s\n", bandwidth); // original author note: about 40 GB/s "with the charger"; hardware dependent

  // Release the timing events, which were previously never destroyed

  checkCudaErrors( cudaEventDestroy(start) );
  checkCudaErrors( cudaEventDestroy(stop) );

  // CUDA exit -- needed to flush printf write buffer

  cudaDeviceReset();

  return 0;
}

Writing prac2_2.cu


### Code 2


`prac2_device.cu`

In [88]:
%%writefile prac2_device.cu

////////////////////////////////////////////////////////////////////////
// GPU version of Monte Carlo algorithm using NVIDIA's CURAND library
////////////////////////////////////////////////////////////////////////

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#include <cuda.h>
#include <curand_kernel.h>

#include "/content/drive/MyDrive/Cours./s8/CHPS802 - GPU/TP/header/helper_cuda.h"

////////////////////////////////////////////////////////////////////////
// CUDA global constants
////////////////////////////////////////////////////////////////////////

__constant__ int   N;
__constant__ float T, r, sigma, rho, alpha, dt, con1, con2;


////////////////////////////////////////////////////////////////////////
// kernel routines -- see sections 3.5, 3.6 in cuRAND documentation
////////////////////////////////////////////////////////////////////////

/*
 * Initialise one cuRAND state per thread: fixed seed 1234, sequence number
 * equal to the global thread index, offset 0. The result is stored in
 * state[global thread index].
 */
__global__ void RNG_init(curandState *state)
{
  const int gid = blockIdx.x*blockDim.x + threadIdx.x;
  curandState *slot = state + gid;
  curand_init(1234, gid, 0, slot);
}


/*
 * Path kernel with on-device random number generation.
 *
 * Each thread simulates mpath paths of N steps, drawing its normal samples
 * from a local copy of device_state[global thread index]; the copy is not
 * written back. The payoff of path m (exp(-r*T) when both s1 and s2 end
 * within 0.1 of 1, otherwise 0) goes to
 * d_v[gid + m*gridDim.x*blockDim.x], provided that index is below NPATH.
 *
 *   device_state  per-thread generator states
 *   d_v           device array receiving the payoffs
 *   mpath         number of paths handled by each thread
 *   NPATH         number of valid entries in d_v
 */
__global__ void pathcalc(curandState *device_state, float *d_v,
                         int mpath, int NPATH)
{
  const int gid = threadIdx.x + blockIdx.x*blockDim.x;
  curandState_t rng = device_state[gid];

  for (int path = 0; path < mpath; path++) {
    float s1 = 1.0f;
    float s2 = 1.0f;

    for (int step = 0; step < N; step++) {
      const float y1 = curand_normal(&rng);
      const float y2 = rho*y1 + alpha*curand_normal(&rng);

      s1 = s1*(con1 + con2*y1);
      s2 = s2*(con1 + con2*y2);
    }

    // payoff is non-zero only when both end values stay in the band
    float payoff = 0.0f;
    if ( fabs(s1-1.0f)<0.1f && fabs(s2-1.0f)<0.1f ) payoff = exp(-r*T);

    // the last sweep over the grid may run past the end of d_v
    const int slot = gid + path*gridDim.x*blockDim.x;
    if (slot >= NPATH) continue;
    d_v[slot] = payoff;
  }
}


////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////

/*
 * Host driver for the device-side RNG version: sizes the grid from the
 * theoretical occupancy of pathcalc, initialises one generator state per
 * launched thread, runs pathcalc with several paths per thread, and prints
 * the kernel timings and the payoff average with its standard error.
 *
 * Returns 0 on success and EXIT_FAILURE when the host buffer cannot be
 * allocated or the occupancy query yields no blocks. Checked CUDA failures
 * go through checkCudaErrors.
 */
int main(int argc, const char **argv){

  const int BLOCK_SIZE = 128;        // threads per block for both kernels
  int     NPATH=9600000, h_N=100;    // number of paths, time steps per path
  float   h_T, h_r, h_sigma, h_rho, h_alpha, h_dt, h_con1, h_con2;
  float  *h_v, *d_v;
  double  sum1, sum2;
  curandState *state;

  // initialise card

  findCudaDevice(argc, argv);

  // initialise CUDA timing

  float milli;
  cudaEvent_t start, stop;
  checkCudaErrors( cudaEventCreate(&start) );
  checkCudaErrors( cudaEventCreate(&stop) );

  // allocate payoff buffers on host and device
  // (the generator states are allocated once the grid size is known)

  h_v = (float *)malloc(sizeof(float)*NPATH);
  if (h_v == NULL) {
    fprintf(stderr, "host allocation of %d payoffs failed\n", NPATH);
    return EXIT_FAILURE;
  }
  checkCudaErrors( cudaMalloc((void **)&d_v, sizeof(float)*NPATH) );

  // sizeof yields a size_t, hence %zu
  printf("size of curandState is %zu bytes\n",sizeof(curandState));

  // define constants and transfer to GPU

  h_T     = 1.0f;
  h_r     = 0.05f;
  h_sigma = 0.1f;
  h_rho   = 0.5f;
  h_alpha = sqrt(1.0f-h_rho*h_rho);
  h_dt    = 1.0f/h_N;
  h_con1  = 1.0f + h_r*h_dt;
  h_con2  = sqrt(h_dt)*h_sigma;

  checkCudaErrors( cudaMemcpyToSymbol(N,    &h_N,    sizeof(h_N)) );
  checkCudaErrors( cudaMemcpyToSymbol(T,    &h_T,    sizeof(h_T)) );
  checkCudaErrors( cudaMemcpyToSymbol(r,    &h_r,    sizeof(h_r)) );
  checkCudaErrors( cudaMemcpyToSymbol(sigma,&h_sigma,sizeof(h_sigma)) );
  checkCudaErrors( cudaMemcpyToSymbol(rho,  &h_rho,  sizeof(h_rho)) );
  checkCudaErrors( cudaMemcpyToSymbol(alpha,&h_alpha,sizeof(h_alpha)) );
  checkCudaErrors( cudaMemcpyToSymbol(dt,   &h_dt,   sizeof(h_dt)) );
  checkCudaErrors( cudaMemcpyToSymbol(con1, &h_con1, sizeof(h_con1)) );
  checkCudaErrors( cudaMemcpyToSymbol(con2, &h_con2, sizeof(h_con2)) );

  // calculate theoretical occupancy -- see Pro Tip blog article:
  // https://developer.nvidia.com/blog/cuda-pro-tip-occupancy-api-simplifies-launch-configuration/

  int device;
  cudaDeviceProp props;
  checkCudaErrors( cudaGetDevice(&device) );
  checkCudaErrors( cudaGetDeviceProperties(&props, device) );

  int maxActiveBlocks = 0;
  checkCudaErrors( cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, pathcalc, BLOCK_SIZE, 0) );
  printf("maxActiveBlocks/SM = %d \n",maxActiveBlocks);
  printf("number of SMs      = %d \n",props.multiProcessorCount);
  int blocks = maxActiveBlocks*props.multiProcessorCount;
  if (blocks < 1) {
    fprintf(stderr, "occupancy query returned no active blocks\n");
    return EXIT_FAILURE;
  }

  // Both kernels index the state array by global thread id, so exactly
  // blocks*BLOCK_SIZE states are needed (not one per path).

  checkCudaErrors( cudaMalloc((void **)&state,
                   sizeof(curandState)*BLOCK_SIZE*blocks) );

  // execute kernels

  cudaEventRecord(start);
  RNG_init<<<blocks, BLOCK_SIZE>>>(state);
  cudaEventRecord(stop);

  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&milli, start, stop);

  getLastCudaError("RNG_init execution failed\n");
  printf("RNG_init kernel execution time (ms): %f \n",milli);

  // ceil(NPATH / number of launched threads)
  int paths_per_thread = (NPATH-1)/(BLOCK_SIZE*blocks) + 1;
  cudaEventRecord(start);
  pathcalc<<<blocks, BLOCK_SIZE>>>(state,d_v,paths_per_thread,NPATH);
  cudaEventRecord(stop);

  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&milli, start, stop);

  getLastCudaError("pathcalc execution failed\n");
  printf("pathcalc kernel execution time (ms): %f \n",milli);

  // copy back results

  checkCudaErrors( cudaMemcpy(h_v, d_v, sizeof(float)*NPATH,
                   cudaMemcpyDeviceToHost) );

  // compute average

  sum1 = 0.0;
  sum2 = 0.0;
  for (int i=0; i<NPATH; i++) {
    sum1 += h_v[i];
    sum2 += h_v[i]*h_v[i];
  }

  printf("\nAverage value and standard deviation of error  = %13.8f %13.8f\n\n",
	 sum1/NPATH, sqrt((sum2/NPATH - (sum1/NPATH)*(sum1/NPATH))/NPATH) );

  // Release memory (including the generator states) and the timing events

  free(h_v);
  checkCudaErrors( cudaFree(d_v) );
  checkCudaErrors( cudaFree(state) );
  checkCudaErrors( cudaEventDestroy(start) );
  checkCudaErrors( cudaEventDestroy(stop) );

  // CUDA exit -- needed to flush printf write buffer

  cudaDeviceReset();

  return 0;
}


Writing prac2_device.cu




---



### Code 3

Calcul de la moyenne de $az^2 + bz + c$, où $z$ suit une loi normale $\mathcal{N}(0,1)$ ; la valeur attendue est $a + c$.

#### Version normale
`prac2_average.cu`

In [89]:
%%writefile prac2_average.cu


#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include "/content/drive/MyDrive/Cours./s8/CHPS802 - GPU/TP/header/helper_cuda.h"



////////////////////////////////////////////////////////////////////////
// CUDA global constants
////////////////////////////////////////////////////////////////////////

__constant__ float  a=20.0f, b=25.0f, c=15.0f;



////////////////////////////////////////////////////////////////////////
// kernel routine
////////////////////////////////////////////////////////////////////////

/*
 * Each thread draws nb_val standard-normal samples z and stores the mean of
 * a*z^2 + b*z + c over those samples in d_tab[global thread index].
 *
 *   seed    cuRAND seed shared by all threads
 *   d_tab   output array, one entry per launched thread (no bounds check:
 *           the launch must not exceed its length)
 *   nb_val  number of samples drawn by each thread
 */
__global__ void kernel_gen_rand_nums(unsigned long seed, float *d_tab, int nb_val)
{
    int tid = threadIdx.x + blockDim.x*blockIdx.x;

    // The sequence number must be unique per thread. With threadIdx.x every
    // block replayed the same blockDim.x sequences, so the per-block results
    // were copies of each other rather than independent samples.
    curandState state;
    curand_init(seed, tid, 0, &state);

    double z, som=0;

    for(int i =0; i<nb_val; i++){
      z = curand_normal(&state);  // standard normal sample (mean 0, std dev 1)
      som += (a*(z*z) + b * z + c);
    }

    d_tab[tid] = som/nb_val;

}


////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////

/*
 * Host driver: launches kernel_gen_rand_nums on nb_blocks x nb_threads
 * threads, averages the per-thread means on the host and prints the result
 * next to a + c.
 *
 * Returns 0 on success and EXIT_FAILURE when the host buffer cannot be
 * allocated. Checked CUDA failures go through checkCudaErrors /
 * getLastCudaError.
 */
int main(int argc, const char **argv){

  // variables
  float *d_tab, *h_tab;      // per-thread means (managed memory / host copy)
  int d_size_tab;            // number of entries in the table
  int nb_val;                // number of samples drawn per thread
  int nb_blocks, nb_threads; // grid and block sizes
  double som_total=0;        // accumulated in double to limit rounding error
  float moyenne=0;
  const float d_a=20.0f, d_c=15.0f;  // host copies of the device constants a and c


  // init size blocks / threads / tab
  nb_blocks = 128;
  nb_threads = 128;
  d_size_tab = nb_blocks * nb_threads;
  nb_val = 200;


  // Allocation (the kernel keeps its generator state in a local variable,
  // so no device-side state array is needed)
  checkCudaErrors(cudaMallocManaged(&d_tab, d_size_tab*sizeof(float)));
  h_tab = (float*)malloc(sizeof(float) * d_size_tab);
  if (h_tab == NULL) {
    fprintf(stderr, "host allocation of %d floats failed\n", d_size_tab);
    return EXIT_FAILURE;
  }



  // execute kernel; the seed comes from the wall clock, so results differ
  // from run to run

  kernel_gen_rand_nums<<<nb_blocks, nb_threads>>>(time(NULL), d_tab, nb_val);
  getLastCudaError("kernel_gen_rand_nums launch failed\n");

  checkCudaErrors( cudaDeviceSynchronize() );


  // Copy device 2 host
  checkCudaErrors( cudaMemcpy(h_tab, d_tab, sizeof(float)*d_size_tab, cudaMemcpyDeviceToHost) );


  // Compute the mean of the per-thread means

  for(int i=0; i<d_size_tab; i++){
    som_total += h_tab[i];
  }

  moyenne = (float)(som_total / d_size_tab);

  printf("Moyenne: %lf\n", moyenne);
  printf("a + c: %lf\n", d_a+d_c);




  // Release memory
  free(h_tab);
  checkCudaErrors( cudaFree(d_tab) );


  // CUDA exit -- needed to flush printf write buffer
  cudaDeviceReset();

  return 0;
}

Overwriting prac2_average.cu


#### Version test unitaire
`prac2_average_test.cu`

In [90]:
%%writefile prac2_average_test.cu

#include <gtest/gtest.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include "/content/drive/MyDrive/Cours./s8/CHPS802 - GPU/TP/header/helper_cuda.h"



////////////////////////////////////////////////////////////////////////
// CUDA global constants
////////////////////////////////////////////////////////////////////////

__constant__ float  a=258.4f, b=27.05f, c=451.358f;


////////////////////////////////////////////////////////////////////////
// kernel routine
////////////////////////////////////////////////////////////////////////



/*
 * Each thread draws nb_val standard-normal samples z and stores the mean of
 * a*z^2 + b*z + c over those samples in d_tab[global thread index].
 *
 *   seed    cuRAND seed shared by all threads
 *   d_tab   output array, one entry per launched thread (no bounds check:
 *           the launch must not exceed its length)
 *   nb_val  number of samples drawn by each thread
 */
__global__ void kernel_gen_rand_nums(unsigned long seed, float *d_tab, int nb_val)
{
    int tid = threadIdx.x + blockDim.x*blockIdx.x;

    // The sequence number must be unique per thread. With threadIdx.x every
    // block replayed the same blockDim.x sequences, so the per-block results
    // were copies of each other rather than independent samples.
    curandState state;
    curand_init(seed, tid, 0, &state);

    double z, som=0;

    for(int i =0; i<nb_val; i++){
      z = curand_normal(&state);  // standard normal sample (mean 0, std dev 1)
      som += (a*(z*z) + b * z + c);
    }

    d_tab[tid] = som/nb_val;

}


////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////

// Unit test: runs kernel_gen_rand_nums on nb_blocks x nb_threads threads,
// averages the per-thread means on the host and expects the result to be
// within 1.5 of a + c.
TEST(CudaTest, CalculMoyenne) {
  // variables
  float *d_tab, *h_tab;      // per-thread means (managed memory / host copy)
  int d_size_tab;            // number of entries in the table
  int nb_val;                // number of samples drawn per thread
  int nb_blocks, nb_threads; // grid and block sizes
  double som_total=0;        // accumulated in double to limit rounding error
  float moyenne=0;
  const float d_a=258.4f, d_c=451.358f;  // host copies of the device constants a and c


  // init size blocks / threads / tab
  nb_blocks = 128;
  nb_threads = 128;
  d_size_tab = nb_blocks * nb_threads;
  nb_val = 200;


  // Allocation (the kernel keeps its generator state in a local variable,
  // so no device-side state array is needed)
  checkCudaErrors(cudaMallocManaged(&d_tab, d_size_tab*sizeof(float)));
  h_tab = (float*)malloc(sizeof(float) * d_size_tab);
  ASSERT_TRUE(h_tab != NULL);



  // execute kernel
  // NOTE(review): the seed comes from the wall clock, so every run draws a
  // different sample; the tolerance below has to cover that statistical error.

  kernel_gen_rand_nums<<<nb_blocks, nb_threads>>>(time(NULL), d_tab, nb_val);
  getLastCudaError("kernel_gen_rand_nums launch failed\n");

  checkCudaErrors( cudaDeviceSynchronize() );


  // Copy device 2 host
  checkCudaErrors( cudaMemcpy(h_tab, d_tab, sizeof(float)*d_size_tab, cudaMemcpyDeviceToHost) );


  // Compute the mean of the per-thread means

  for(int i=0; i<d_size_tab; i++){
    som_total += h_tab[i];
  }

  moyenne = (float)(som_total / d_size_tab);

  printf("Moyenne: %lf\n", moyenne);
  printf("a + c: %lf\n", d_a+d_c);

  // Check that the measured mean is close to a + c, with an error margin of 1.5
  EXPECT_NEAR(moyenne, d_a+d_c, 1.5);


  // Release memory
  free(h_tab);
  checkCudaErrors( cudaFree(d_tab) );


  // CUDA exit -- needed to flush printf write buffer
  cudaDeviceReset();

}

// Test-runner entry point: hands the command line to Google Test, runs every
// registered test and uses the outcome as the process exit code.
int main(int argc, char **argv) {
    ::testing::InitGoogleTest(&argc, argv);
    const int exit_status = RUN_ALL_TESTS();
    return exit_status;
}

Overwriting prac2_average_test.cu


## Compilation & exécution

### Compilation cuda

In [91]:
!make

nvcc prac2_device.cu -o prac2_device -I/include -I. -I../headers -lineinfo -arch=sm_70 --ptxas-options=-v --use_fast_math -L/lib64 -lcudart -lcurand
ptxas info    : 218048 bytes gmem, 108 bytes cmem[3], 64 bytes cmem[4]
ptxas info    : Compiling entry function '_Z8pathcalcP17curandStateXORWOWPfii' for 'sm_70'
ptxas info    : Function properties for _Z8pathcalcP17curandStateXORWOWPfii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 29 registers, 376 bytes cmem[0]
ptxas info    : Compiling entry function '_Z8RNG_initP17curandStateXORWOW' for 'sm_70'
ptxas info    : Function properties for _Z8RNG_initP17curandStateXORWOW
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 360 bytes cmem[0]
nvcc prac2_average.cu -o prac2_average -I/include -I. -I../headers -lineinfo -arch=sm_70 --ptxas-options=-v --use_fast_math -L/lib64 -lcudart -lcurand
ptxas info    : 218048 bytes gmem, 84 bytes cmem[3], 64 bytes c

### Execution

In [92]:
# Version 1
!./prac2_1

GPU Device 0: "Turing" with compute capability 7.5

CURAND normal RNG  execution time (ms): 166.990341,  samples/sec: 1.149767e+10 
Monte Carlo kernel execution time (ms): 29.559776 

Average value and standard deviation of error  =    0.41786269    0.00015237

Data read: 7.680000 GB
Data written: 0.038400 GB
Total data transferred: 7.718400 GB
Effective memory bandwidth: 261.111584 GB/s


In [93]:
# Version 2
!./prac2_2

GPU Device 0: "Turing" with compute capability 7.5

CURAND normal RNG  execution time (ms): 107.004028,  samples/sec: 1.794325e+10 
Monte Carlo kernel execution time (ms): 90.240799 

Average value and standard deviation of error  =    0.41793859    0.00015237

Data read: 7.680000 GB
Data written: 0.038400 GB
Total data transferred: 7.718400 GB
Effective memory bandwidth: 85.531158 GB/s


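On remarque que la version 2 est bien plus lente (≈ 90 ms contre ≈ 30 ms pour le noyau), et cela est dû aux accès mémoire. <br>
Dans la 1ère version, les threads voisins d'un bloc lisent à chaque étape des cases consécutives de `d_z` : les accès sont contigus (coalescés) et chaque ligne de cache chargée est entièrement utilisée. <br>
Dans la 2ème version, chaque thread parcourt son propre segment de `2*N` valeurs ; deux threads voisins lisent donc des cases distantes de `2*N` éléments, le tableau n'est pas chargé à la suite et on perd le bénéfice des accès mémoire contigus.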

---



In [None]:
!./prac2_device

GPU Device 0: "Turing" with compute capability 7.5

size of curandState is 48 bytes
maxActiveBlocks/SM = 8 
number of SMs      = 40 
RNG_init kernel execution time (ms): 1.703936 
pathcalc kernel execution time (ms): 23.821184 

Average value and standard deviation of error  =    0.41802440    0.00015237





---



In [83]:
!./prac2_average

Moyenne: 34.968338
a + c: 35.000000


In [80]:
!./prac2_average_test

[0;32m[----------] [mGlobal test environment set-up.
[0;32m[----------] [m1 test from CudaTest
[0;32m[ RUN      ] [mCudaTest.CalculMoyenne
Moyenne: 710.186951
a + c: 709.757996
[0;32m[       OK ] [mCudaTest.CalculMoyenne (143 ms)
[0;32m[----------] [m1 test from CudaTest (143 ms total)

[0;32m[----------] [mGlobal test environment tear-down
[0;32m[  PASSED  ] [m1 test.
