[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1mk0QXjREp-k_J-pdFfmgoC7-EVmgPyAo#scrollTo=g4gyOZKkHDVU)

# ✔ CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

## NVCC Plugin for Jupyter notebook

*Usage*:


*   Load Extension `%load_ext nvcc_plugin`
*   Mark a cell to be treated as cuda cell
`%%cuda --name example.cu --compile false`

**NOTE**: The cell must contain either code or comments to run successfully. It accepts two arguments. The first, `-n | --name`, is the name of either a CUDA source or a header file; the name must have the extension `.cu` or `.h`. The second, `-c | --compile` (default value: false), is a flag that specifies whether the cell is compiled and run right away. It might be useful if you're playing in the main function.

*  We are ready to run CUDA C/C++ code right in the notebook. To do so, we need to explicitly tell the interpreter that we want to use the extension, by adding `%%cu` at the beginning of each cell with CUDA code.




In [None]:
# Install the plugin over HTTPS (GitHub no longer serves the unauthenticated git:// protocol)
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
%load_ext nvcc_plugin

In [None]:
%cd /home/grossi/CUDA
%ls

# ✔ VS code on Colab

In [None]:
# 1. Install the colab-code package...
!pip install colabcode

In [None]:
# 2. Import and launch...
from colabcode import ColabCode
ColabCode()

# ✔ DeviceQuery

In [None]:
# DeviceQuery for the current device
# (the binary is built in the current directory, so run it with an explicit ./)
!nvcc /home/grossi/CUDA/lab4/deviceQuery/deviceQuery.cu -o deviceQuery
!./deviceQuery

# Moltiplicazione matriciale con SMEM (Shared Memory)


In [None]:
#@title working directory: **matProdSMEM**
%cd /home/grossi/CUDA/lab5/matProdSMEM
%ls

In [None]:
%%writefile /home/grossi/CUDA/lab5/matProdSMEM/matProdSMEM.cu

#include <stdio.h>
#include <stdlib.h>
#include "../../utils/common.h"

// Row-major linear index of element (i,j) in a matrix with n columns.
// Arguments and result are fully parenthesized so expression arguments expand safely.
#define IDX(i,j,n) ((i)*(n)+(j))
// Absolute difference |x - y| (arguments are evaluated more than once).
#define ABS(x,y) (((x)-(y))>=0?((x)-(y)):((y)-(x)))
#define N 1024
#define P 1024
#define M 1024

#define BLOCK_SIZE 16

/*
 * Kernel for tiled matrix product with static SMEM
 *      C  =  A  *  B
 *    (NxM) (NxP) (PxM)
 *
 * All matrices are indexed row-major. Each thread accumulates one element
 * C[row][col]; each block stages BLOCK_SIZE x BLOCK_SIZE tiles of A and B
 * in shared memory.
 *
 * Launch requirements:
 *   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE): the tiles are indexed
 *     directly with threadIdx
 *   - the grid must cover at least M columns (x) and N rows (y)
 *   - static SMEM per block: 2 * BLOCK_SIZE * BLOCK_SIZE floats
 */
__global__ void matProdSMEMstatic(float* A, float* B, float* C) {
	// global coordinates of the C element owned by this thread
	uint row = blockIdx.y * blockDim.y + threadIdx.y;
	uint col = blockIdx.x * blockDim.x + threadIdx.x;

	// running dot product for (row, col)
	float sum = 0.0f;

	// static shared memory: one tile of A and one tile of B
	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

	// loop over tiles along the shared dimension P
	uint numBlocks = (P + BLOCK_SIZE - 1) / BLOCK_SIZE;
	for (uint m = 0; m < numBlocks; m++) {

		// copy one tile of A and B to shared memory; positions that fall
		// outside the matrices are zero-filled so that (a) no out-of-bounds
		// global read happens when N, M or P is not a multiple of BLOCK_SIZE
		// and (b) the padded entries add nothing to the dot product
		uint r = m * BLOCK_SIZE + threadIdx.y;
		uint c = m * BLOCK_SIZE + threadIdx.x;
		As[threadIdx.y][threadIdx.x] = (row < N && c < P) ? A[row * P + c] : 0.0f;
		Bs[threadIdx.y][threadIdx.x] = (r < P && col < M) ? B[r * M + col] : 0.0f;

		//---------------------------------------------------------------
		__syncthreads();  //  BARRIER SYNC on SMEM loading

		// partial row-column product over this tile; thanks to the zero
		// padding the last (possibly partial) tile needs no special case
		for (uint k = 0; k < BLOCK_SIZE; k++)
			sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];

		//---------------------------------------------------------------
		__syncthreads();  //  BARRIER SYNC on prod over blocks
		// make sure every thread is done reading the tiles before they
		// are overwritten in the next iteration
	}

	// store computed element in matrix C
	if (row < N && col < M)
		C[row * M + col] = sum;
}


/*
 * Kernel for tiled matrix product using dynamic SMEM
 *      C  =  A  *  B
 *    (NxM) (NxP) (PxM)
 *
 * Same algorithm as the static version, but the tile edge is taken from the
 * launch configuration instead of a compile-time constant.
 *
 * Parameters:
 *   A, B, C   - device pointers, row-major
 *   SMEMsize  - number of floats reserved for the A tile; the B tile starts
 *               at smem[SMEMsize]
 *
 * Launch requirements:
 *   - blockDim.x == blockDim.y (square tiles)
 *   - SMEMsize >= blockDim.x * blockDim.y
 *   - dynamic shared memory >= 2 * SMEMsize * sizeof(float) bytes
 *   - the grid must cover at least M columns (x) and N rows (y)
 */
__global__ void matProdSMEMdynamic(float* A, float* B, float* C, const uint SMEMsize) {
	// global coordinates of the C element owned by this thread
	uint row = blockIdx.y * blockDim.y + threadIdx.y;
	uint col = blockIdx.x * blockDim.x + threadIdx.x;

	// edge of the (square) tile; used consistently as the row stride of both
	// shared tiles instead of mixing blockDim.x and blockDim.y
	const uint tile = blockDim.x;

	// dynamic shared memory, sized by the third launch parameter
	extern __shared__ float smem[];

	// As occupies the first SMEMsize floats, Bs follows it
	float *As = smem;
	float *Bs = &smem[SMEMsize];

	// loop over tiles along the shared dimension P
	float sum = 0.0f;
	uint numBlocks = (P + tile - 1) / tile;
	for (uint m = 0; m < numBlocks; m++) {

		// copy one tile of A and B to shared memory, zero-filling positions
		// outside the matrices (avoids out-of-bounds global reads and makes
		// the padded entries neutral in the dot product)
		uint c = m * tile + threadIdx.x;
		uint r = m * tile + threadIdx.y;
		const uint slot = threadIdx.y * tile + threadIdx.x;
		As[slot] = (row < N && c < P) ? A[row * P + c] : 0.0f;
		Bs[slot] = (r < P && col < M) ? B[r * M + col] : 0.0f;

		//---------------------------------------------------------------
		__syncthreads();  // tiles fully loaded

		// partial row-column product over this tile
		for (uint k = 0; k < tile; k++)
			sum += As[threadIdx.y * tile + k] * Bs[k * tile + threadIdx.x];

		//---------------------------------------------------------------
		__syncthreads();  // tiles no longer needed, safe to overwrite
	}

	// store computed element in matrix C
	if (row < N && col < M)
		C[row * M + col] = sum;
}

/*
 * Naive matrix product kernel (no shared memory): one thread per entry of C.
 * A is read as N x P, B as P x M and C is written as N x M, all row-major.
 */
__global__ void matProd(float* A, float* B, float* C) {
	const int c = blockIdx.x * blockDim.x + threadIdx.x;
	const int r = blockIdx.y * blockDim.y + threadIdx.y;

	// threads mapped outside the N x M result have nothing to compute
	if (r >= N || c >= M)
		return;

	// walk row r of A and column c of B
	const float* rowOfA = A + r * P;
	const float* colOfB = B + c;
	float acc = 0;
	for (int k = 0; k < P; k++)
		acc += rowOfA[k] * colOfB[k * M];

	C[r * M + c] = acc;
}

/*
 *  Reference matrix product on the CPU: C (N x M) = A (N x P) * B (P x M),
 *  row-major, accumulating each entry in increasing k order.
 */
void matProdCPU(float* A, float* B, float* C) {
	for (int r = 0; r < N; r++) {
		const float* aRow = A + r * P;   // row r of A
		float* cRow = C + r * M;         // row r of C

		for (int c = 0; c < M; c++) {
			float acc = 0;
			int k = 0;
			while (k < P) {
				acc += aRow[k] * B[k * M + c];
				k++;
			}
			cRow[c] = acc;
		}
	}
}

/*
 * Select device 0, request an equal L1/shared-memory split and print the
 * device name and its shared memory per block.
 *
 * Returns: the amount of shared memory available per block, in bytes.
 *
 * Every runtime call is wrapped in CHECK (from common.h; assumed to report
 * and abort on a CUDA error - confirm against its definition) so that a
 * failure here is not silently carried into later calls.
 */
unsigned long testCUDADevice(void) {
	int dev = 0;
	cudaDeviceProp deviceProp;

	// select the device first so the cache configuration applies to it
	CHECK(cudaSetDevice(dev));
	CHECK(cudaDeviceSetCacheConfig(cudaFuncCachePreferEqual));
	CHECK(cudaGetDeviceProperties(&deviceProp, dev));

	printf("Device %d: \"%s\"\n", dev, deviceProp.name);
	// sharedMemPerBlock is a size_t: cast explicitly to match %lu
	printf("Total amount of shared memory available per block: %lu KB\n",
			(unsigned long) (deviceProp.sharedMemPerBlock / 1024));
	return (unsigned long) deviceProp.sharedMemPerBlock;
}


/*
 * Element-wise comparison of two float arrays of N*M entries.
 * Scans for the first pair whose absolute difference exceeds the tolerance:
 * prints a mismatch notice if one is found, a confirmation otherwise.
 */
void checkResult(float *A, float *B) {
	const double tol = 1.0E-8;
	const int count = N*M;

	// advance while the current pair is within tolerance
	int idx = 0;
	while (idx < count && !(ABS(A[idx], B[idx]) > tol))
		idx++;

	if (idx < count)
		printf("   * Arrays do not match!\n");
	else
		printf("   Arrays match\n\n");
}

/*
 * MAIN: builds two random matrices, computes their product on the CPU and on
 * the GPU (static shared-memory kernel, then the dynamic shared-memory kernel
 * with several block sizes), prints the elapsed time of each kernel and
 * compares every GPU result with the CPU reference.
 *
 * CHECK and seconds() come from common.h; CHECK is assumed to report and
 * abort on a CUDA error and seconds() to return a wall-clock time in
 * seconds - confirm against that header.
 */
int main(void) {
	 // Kernels for matrix product
	 //      C  =  A  *  B
	 //    (NxM) (NxP) (PxM)
	uint rowA = N, rowB = P;
	uint colA = P, colB = M;
	uint rowC = N, colC = M;
	float *A, *B, *C, *C1;
	float *dev_A, *dev_B, *dev_C;

	// sizes in bytes
	unsigned long Asize = rowA * colA * sizeof(float);
	unsigned long Bsize = rowB * colB * sizeof(float);
	unsigned long Csize = rowC * colC * sizeof(float);
	unsigned long maxSMEMbytes;
	uint nByteSMEM = 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(float);
	printf("N = %d, M = %d, P = %d\n",N,M,P);

	// test device shared memory
	maxSMEMbytes = testCUDADevice();
	if (maxSMEMbytes < nByteSMEM)
		printf("Shared memory usage WARNING: available: %lu, required: %u bytes\n",
				maxSMEMbytes, nByteSMEM);
	else
		printf("Total amount of shared memory required per block %.1f KB\n",
				(float) nByteSMEM / (float) 1024);

	// malloc host memory (C: CPU reference, C1: result copied back from GPU)
	A = (float*) malloc(Asize);
	B = (float*) malloc(Bsize);
	C = (float*) malloc(Csize);
	C1 = (float*) malloc(Csize);
	if (A == NULL || B == NULL || C == NULL || C1 == NULL) {
		fprintf(stderr, "Host memory allocation failed\n");
		free(A);
		free(B);
		free(C);
		free(C1);
		return EXIT_FAILURE;
	}

	// malloc device memory
	CHECK(cudaMalloc((void** )&dev_A, Asize));
	CHECK(cudaMalloc((void** )&dev_B, Bsize));
	CHECK(cudaMalloc((void** )&dev_C, Csize));
	printf("Total amount of allocated memory on GPU %lu bytes\n\n",
			Asize + Bsize + Csize);

	// fill the matrices A and B with small integers, then compute the reference
	for (int i = 0; i < N * P; i++)
		A[i] = rand() % 10;
	for (int i = 0; i < P * M; i++)
		B[i] = rand() % 10;
	matProdCPU(A, B, C);

	// copy matrices A and B to the GPU
	CHECK(cudaMemcpy(dev_A, A, Asize, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(dev_B, B, Bsize, cudaMemcpyHostToDevice));

	/***********************************************************/
	/*              GPU matProdSMEM static SMEM               */
	/***********************************************************/
	// grid block dims = shared mem dims = BLOCK_SIZE
	dim3 block(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid((M + block.x - 1) / block.x, (N + block.y - 1) / block.y);
	double start = seconds();
	matProdSMEMstatic<<<grid, block>>>(dev_A, dev_B, dev_C);
	CHECK(cudaGetLastError());         // launch-configuration errors
	CHECK(cudaDeviceSynchronize());    // execution errors + wait for timing
	printf("   Kernel matProdSMEM static elapsed time GPU = %f\n", seconds() - start);

	// copy the array 'C' back from the GPU to the CPU
	CHECK(cudaMemcpy(C1, dev_C, Csize, cudaMemcpyDeviceToHost));
	checkResult(C,C1);

	/***********************************************************/
	/*            GPU matProdSMEMD dynamic SMEM                */
	/***********************************************************/
	// set cache size
	CHECK(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared));

	// try with various SMEM sizes
	uint sizes[] = {8,16,32};
	for (int i = 0; i < 3; i++) {
		uint blockSize = sizes[i];
		block.x = blockSize;
		block.y = blockSize;
		grid.x = (M + block.x - 1) / block.x;
		grid.y = (N + block.y - 1) / block.y;
		uint SMEMsize = blockSize * blockSize;           // floats per tile
		uint SMEMbyte = 2 * SMEMsize * sizeof(float);    // two tiles, in bytes
		start = seconds();
		matProdSMEMdynamic<<< grid, block, SMEMbyte >>>(dev_A, dev_B, dev_C, SMEMsize);
		CHECK(cudaGetLastError());
		CHECK(cudaDeviceSynchronize());
		printf("   Kernel matProdSMEM dynamic (SMEM size %u) elapsed time GPU = %f\n", blockSize, seconds() - start);

		// copy the array 'C' back from the GPU to the CPU
		CHECK(cudaMemcpy(C1, dev_C, Csize, cudaMemcpyDeviceToHost));
		checkResult(C,C1);
	}

	// free the memory allocated on the GPU
	CHECK(cudaFree(dev_A));
	CHECK(cudaFree(dev_B));
	CHECK(cudaFree(dev_C));

	// free the host buffers
	free(A);
	free(B);
	free(C);
	free(C1);

	CHECK(cudaDeviceReset());
	return EXIT_SUCCESS;
}


In [None]:
# Compile and run (the binary lives in the current directory, hence ./)

!nvcc -arch=sm_60 matProdSMEM.cu  -o matProdSMEM
!./matProdSMEM

In [None]:
!ls -la

# Convoluzione con SMEM

In [None]:
#@title working directory: **conv**
%cd /home/grossi/CUDA/lab5/conv/
%ls

In [None]:
%%writefile /home/grossi/CUDA/lab5/conv/conv1D.cu

#include <stdlib.h>
#include <stdio.h>
#include "../../utils/common.h"

// Mask half-width; the mask spans MASK_RADIUS elements on each side of the centre.
#define MASK_RADIUS  5
// Total mask length. Parenthesized so the macro can be used inside larger expressions.
#define MASK_SIZE    (2 * MASK_RADIUS + 1)
// Threads per block used by the launch configuration.
#define BLOCK_SIZE   128
// Shared tile length: one block of data plus MASK_RADIUS halo elements on each side.
#define TILE_WIDTH   (BLOCK_SIZE + MASK_SIZE - 1)

__device__ __constant__ float d_mask[MASK_SIZE];

void initialData(float*, int);
void movingAverage(float*, int n);
void printData(float*, const int);
void convolutionHost(float*, float*, float*, const int);
void checkResult(float*, float*, int);

/*
 * Kernel for 1D convolution of data (n elements) with the mask stored in
 * d_mask; elements outside [0, n) are treated as zero.
 *
 * Each block stages its slice of data plus MASK_RADIUS halo elements on each
 * side in a shared tile, then every thread computes one output element.
 *
 * Parameters:
 *   result - output array, n elements
 *   data   - input array, n elements
 *   n      - number of elements
 *
 * Launch requirements:
 *   - 1D blocks with MASK_RADIUS <= blockDim.x <= BLOCK_SIZE
 *     (the tile holds BLOCK_SIZE + 2 * MASK_RADIUS floats)
 *   - the grid must cover at least n threads; n need not be a multiple of
 *     the block size
 */
__global__ void convolution1D(float *result, float *data, int n) {
	const int tid = threadIdx.x;
	const int bdim = blockDim.x;
	const int blockStart = blockIdx.x * bdim;   // first element owned by this block
	const int blockEnd = blockStart + bdim;     // one past the last owned element
	const int gid = blockStart + tid;           // element owned by this thread

	// shared memory size = BLOCK_SIZE + MASK
	__shared__ float tile[TILE_WIDTH];

	// left halo: the first MASK_RADIUS threads fetch the elements that
	// precede the block (zero when they fall outside the array)
	if (tid < MASK_RADIUS) {
		const int src = blockStart - MASK_RADIUS + tid;
		tile[tid] = (src >= 0 && src < n) ? data[src] : 0.0f;
	}

	// center: zero-filled past the end of the array so the tail block
	// never reads out of bounds
	tile[tid + MASK_RADIUS] = (gid < n) ? data[gid] : 0.0f;

	// right halo: the last MASK_RADIUS threads fetch the elements that
	// follow the block; each index is checked individually
	if (tid >= bdim - MASK_RADIUS) {
		const int src = blockEnd + tid - (bdim - MASK_RADIUS);
		tile[tid + 2 * MASK_RADIUS] = (src < n) ? data[src] : 0.0f;
	}

	// every thread reaches the barrier, including those past the end of the array
	__syncthreads();

	// threads beyond the array helped loading but own no output element
	if (gid >= n)
		return;

	// convolution: tile * mask
	float sum = 0.0f;
	for (int k = -MASK_RADIUS; k <= MASK_RADIUS; k++)
		sum += tile[tid + MASK_RADIUS + k] * d_mask[k + MASK_RADIUS];

	// final result
	result[gid] = sum;
}

/*
 * MAIN: 1D convolution on host and device.
 * Builds a constant input array and a moving-average mask, runs the
 * convolution on the CPU and on the GPU, prints both elapsed times and their
 * ratio, and compares the two results.
 *
 * CHECK and seconds() come from common.h; CHECK is assumed to report and
 * abort on a CUDA error and seconds() to return a wall-clock time in
 * seconds - confirm against that header.
 */
int main(int argc, char **argv) {
	// set up device
	int dev = 0;
	cudaDeviceProp deviceProp;
	CHECK(cudaGetDeviceProperties(&deviceProp, dev));
	printf("starting conv1D at device %d: %s\n", dev, deviceProp.name);
	CHECK(cudaSetDevice(dev));

	// set up array size (n: data elements, N: mask elements)
	int n = 1 << 24;
	int N = MASK_SIZE;

	printf("Array of size = %.1f MB\n", n/(1024.0*1024.0));

	// mem sizes
	size_t nBytes = n * sizeof(float);
	size_t nBytes_mask = N * sizeof(float);

	// grid configuration
	dim3 block(BLOCK_SIZE);
	dim3 grid((n + BLOCK_SIZE - 1) / BLOCK_SIZE);

	// allocate host memory
	// (h_result: GPU output copied back, result: CPU reference output)
	float *h_data = (float *) malloc(nBytes);
	float *h_result = (float *) malloc(nBytes);
	float *result = (float *) malloc(nBytes);
	float *h_mask = (float *) malloc(nBytes_mask);
	if (h_data == NULL || h_result == NULL || result == NULL || h_mask == NULL) {
		fprintf(stderr, "Host memory allocation failed\n");
		free(h_data);
		free(h_result);
		free(result);
		free(h_mask);
		return EXIT_FAILURE;
	}

	//  initialize host array
	movingAverage(h_mask, N);
	initialData(h_data, n);

	// convolution on host
	double start = seconds();
	convolutionHost(h_data, result, h_mask, n);
	double hostElaps = seconds() - start;

	// allocate device memory
	float *d_data, *d_result;
	CHECK(cudaMalloc((void**)&d_data, nBytes));
	CHECK(cudaMalloc((void**)&d_result, nBytes));

	// copy data from host to device (the mask goes to constant memory)
	CHECK(cudaMemcpy(d_data, h_data, nBytes, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpyToSymbol(d_mask, h_mask, nBytes_mask));

	start = seconds();
	convolution1D<<<grid, block>>>(d_result, d_data, n);
	CHECK(cudaGetLastError());         // launch-configuration errors
	CHECK(cudaDeviceSynchronize());    // execution errors + wait for timing
	double devElaps = seconds() - start;
	printf("Times:\n");
	printf("   - CPU elapsed time = %f\n", hostElaps);
	printf("   - GPU elapsed time = %f\n", devElaps);
	printf("   - Speed-up (ratio) = %f\n", hostElaps / devElaps);

	CHECK(cudaMemcpy(h_result, d_result, nBytes, cudaMemcpyDeviceToHost));

	// check result
	checkResult(h_result, result, n);

	// free host and device memory
	CHECK(cudaFree(d_result));
	CHECK(cudaFree(d_data));
	free(h_data);
	free(h_mask);
	free(h_result);
	free(result);

	// reset device
	CHECK(cudaDeviceReset());
	return EXIT_SUCCESS;
}

/*
 * Fill the first n entries of h_data with the constant 10.
 */
void initialData(float *h_data, int n) {
	float *cursor = h_data;
	int remaining = n;
	while (remaining > 0) {
		*cursor++ = 10.0;
		remaining--;
	}
}

/*
 * Build a moving-average mask: each of the n entries of h_mask is set to 1/n.
 */
void movingAverage(float *h_mask, int n) {
	float *slot = h_mask;
	float *const stop = h_mask + (n > 0 ? n : 0);
	// same expression as each per-element store: double division, narrowed to float
	for (; slot != stop; ++slot)
		*slot = 1.0 / ((float) n);
}

/*
 * Print the first `size` entries of a on one line (two decimals each),
 * preceded and followed by a newline.
 */
void printData(float *a, const int size) {
	printf("\n");
	int pos = 0;
	while (pos < size) {
		printf("%.2f ", a[pos]);
		++pos;
	}
	printf("\n");
}

/*
 * Reference 1D convolution on the host.
 * result[i] is the sum of data[i - MASK_RADIUS + j] * mask[j] over the mask
 * positions j whose data index lies inside [0, n); positions outside the
 * array contribute nothing.
 */
void convolutionHost(float *data, float *result, float *mask, const int n) {
	const int maskLen = MASK_SIZE;

	for (int i = 0; i < n; i++) {
		// clamp the mask range so that every data index stays inside [0, n)
		int first = MASK_RADIUS - i;          // smallest j with index >= 0
		if (first < 0)
			first = 0;
		int last = n - i + MASK_RADIUS;       // one past the largest j with index < n
		if (last > maskLen)
			last = maskLen;

		float acc = 0;
		for (int j = first; j < last; j++)
			acc += data[i - MASK_RADIUS + j] * mask[j];
		result[i] = acc;
	}
}

/*
 * Compare two float arrays of n entries and report the first index at which
 * their absolute difference exceeds the tolerance. Prints nothing when all
 * entries agree.
 *
 * The absolute difference is computed explicitly in floating point instead of
 * through an unqualified abs(): which abs overload gets picked depends on the
 * headers in scope, and the integer one would truncate the difference.
 */
void checkResult(float *d_result, float *h_result, int n) {
	double epsilon = 1.0E-8;

	for (int i = 0; i < n; i++) {
		float diff = h_result[i] - d_result[i];
		if (diff < 0)
			diff = -diff;
		if (diff > epsilon) {
			printf("different on entry (%d) |h_result - d_result| >  %f\n", i,
					epsilon);
			break;
		}
	}
}



In [None]:
# Compilazione ed esecuzione
!nvcc -arch=sm_60  conv1D.cu -o conv1D
!./conv1D

In [None]:
%%writefile /home/grossi/CUDA/lab5/conv/conv2D.cu
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "../../utils/common.h"

#define DATA_WIDTH   (24)
#define DATA_HEIGHT  (24)
#define BLOCK_SIZE   8
#define MASK_RADIUS  2
#define MASK_SIZE    (2 * MASK_RADIUS + 1)
#define TILE_WIDTH   (BLOCK_SIZE + MASK_SIZE - 1)
#define DEBUG 1

// constant mem
__constant__ float M_dev[MASK_SIZE*MASK_SIZE];

/*
 * Kernel for 2D convolution of the DATA_HEIGHT x DATA_WIDTH row-major image A
 * with the MASK_SIZE x MASK_SIZE mask stored in M_dev; the result is written
 * to B with the same layout. Pixels outside the image are treated as zero.
 *
 * Each block stages a TILE_WIDTH x TILE_WIDTH tile (its own pixels plus a
 * halo of MASK_RADIUS on every side) in shared memory, then every thread
 * computes one output pixel.
 *
 * Launch requirements:
 *   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE): the tile is sized for it
 *   - the grid must cover at least DATA_WIDTH (x) by DATA_HEIGHT (y) threads
 *
 * Block (0,0) also prints its shared tile (debug output).
 */
__global__ void conv2D(float *A, float *B) {
	const int x = blockIdx.x * blockDim.x + threadIdx.x;   // image column
	const int y = blockIdx.y * blockDim.y + threadIdx.y;   // image row
	const int RAD = MASK_RADIUS;
	const int W = DATA_WIDTH;
	const int H = DATA_HEIGHT;
	const int m = MASK_SIZE;

	// shared mem
	__shared__ float A_s[TILE_WIDTH][TILE_WIDTH];

	// START SHARED MEMORY LOADING
	// image coordinates of tile element [0][0] (may be negative on the border)
	const int x0 = (int) (blockIdx.x * blockDim.x) - RAD;
	const int y0 = (int) (blockIdx.y * blockDim.y) - RAD;

	// the block's threads sweep the whole tile (center and halo) in strides
	// of the block size; every tile cell is written exactly once, and cells
	// that fall outside the image get 0 instead of being left uninitialized
	for (int ty = threadIdx.y; ty < TILE_WIDTH; ty += blockDim.y)
		for (int tx = threadIdx.x; tx < TILE_WIDTH; tx += blockDim.x) {
			const int gx = x0 + tx;
			const int gy = y0 + ty;
			const bool inside = (gx >= 0 && gx < W && gy >= 0 && gy < H);
			A_s[ty][tx] = inside ? A[gy * W + gx] : 0.0f;
		}
	// END SHARED MEMORY LOADING

	__syncthreads();

	// debug: one thread dumps the tile of block (0,0)
	if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && threadIdx.y == 0) {
		printf("BLOCK(%d,%d) - TILE_WIDTH = %d\n",blockIdx.x, blockIdx.y, TILE_WIDTH);
		for (int i = 0; i < TILE_WIDTH; i++) {
			for (int j = 0; j < TILE_WIDTH; j++)
				printf("%1.0f ", A_s[i][j]);
			printf("\n");
		}
	}

	// threads outside the image helped loading but own no output pixel
	if (x >= W || y >= H)
		return;

	// the mask window for pixel (x, y) starts at tile [threadIdx.y][threadIdx.x]
	float conv_sum = 0.0f;
	for (int i = 0; i < m; i++)
		for (int j = 0; j < m; j++)
			conv_sum += A_s[threadIdx.y+i][threadIdx.x+j] * M_dev[i*m + j];

	// store conv result (row-major: row y, column x)
	B[y * W + x] = conv_sum;
}

/*
 * Average filter: sets each of the MASK_SIZE * MASK_SIZE entries of mask
 * to 1 / (MASK_SIZE * MASK_SIZE).
 */
void Avg_mask(float *mask) {
	const int side = MASK_SIZE;
	const int cells = side * side;
	const float weight = 1.0f / (float) cells;

	float *slot = mask;
	float *const stop = mask + cells;
	while (slot != stop)
		*slot++ = weight;
}


/*
 * main: builds a random DATA_HEIGHT x DATA_WIDTH image and an averaging mask,
 * runs the conv2D kernel, prints its elapsed time and (when DEBUG is set)
 * the input image, the mask and the result.
 *
 * CHECK and seconds() come from common.h; CHECK is assumed to report and
 * abort on a CUDA error and seconds() to return a wall-clock time in
 * seconds - confirm against that header.
 */
int main(void) {

	// check params
	if (MASK_RADIUS >= BLOCK_SIZE) {
		printf("ERROR: it holds only if MASK_RADIUS < BLOCK_SIZE!\n");
		return 1;
	}

	int nW = DATA_WIDTH;
	int nH = DATA_HEIGHT;
	int b = BLOCK_SIZE;

	float M[MASK_SIZE*MASK_SIZE]; // const size
	float *A, *B, *A_dev, *B_dev;
	int datasize = nW * nH * sizeof(float);
	int masksize = MASK_SIZE*MASK_SIZE * sizeof(float);

	printf("Data size: %.2f (MB)\n", (float)datasize/(1024.0*1024.0));
	printf("Initializing data...\n");
	A = (float *) malloc(datasize);
	B = (float *) malloc(datasize);
	if (A == NULL || B == NULL) {
		fprintf(stderr, "Host memory allocation failed\n");
		free(A);
		free(B);
		return 1;
	}

	// initialize data
	for (int i = 0; i < nH; i++)
		for (int j = 0; j < nW; j++)
			A[i*nW+j] = rand()%10;

	// initialize mask
	Avg_mask(M);

#if DEBUG
	// print data (extra spacing every 8 rows / columns)
	printf("Print matrix A...\n");
	for (int i = 0; i < nH; i++) {
		if (i%8 == 0 && i>0)
			printf("\n");

		for (int j = 0; j < nW; j++)
			if (j%8 == 0 && j>0)
				printf(" %0.0f ", A[i*nW+j]);
			else
				printf("%0.0f ", A[i*nW+j]);
		printf("\n");
	}

	printf("Print matrix M ...\n");
	for (int i = 0; i < MASK_SIZE; i++) {
		for (int j = 0; j < MASK_SIZE; j++)
			printf(" %1.2f ", M[i * MASK_SIZE + j]);
		printf("\n");
	}
#endif

	// cuda allocation (the mask goes to constant memory)
	CHECK(cudaMemcpyToSymbol(M_dev, M, masksize));
	CHECK(cudaMalloc((void **) &A_dev, datasize));
	CHECK(cudaMalloc((void **) &B_dev, datasize));
	CHECK(cudaMemcpy(A_dev, A, datasize, cudaMemcpyHostToDevice));

	// block, grid dims, kernel
	dim3 block(b, b);
	dim3 grid((nW+b-1)/b, (nH+b-1)/b);
	double iStart, iElaps;
	iStart = seconds();
	conv2D<<<grid, block>>>(A_dev, B_dev);
	CHECK(cudaGetLastError());         // launch-configuration errors
	CHECK(cudaDeviceSynchronize());    // execution errors + wait for timing
	iElaps = seconds() - iStart;
	printf("\nconv2D<<<(%d,%d), (%d,%d)>>> elapsed time %f sec \n\n", grid.x, grid.y, block.x, block.y, iElaps);

	CHECK(cudaMemcpy(B, B_dev, datasize, cudaMemcpyDeviceToHost));

#if DEBUG
	// print out data (extra spacing every 8 rows / columns)
	printf("Print results...\n");
	for (int i = 0; i < nH; i++) {
		if (i%8 == 0 && i>0)
			printf("\n");
		for (int j = 0; j < nW; j++)
			if (j%8 == 0 && j>0)
				printf(" %0.2f ", B[i*nW+j]);
			else
				printf("%0.2f ", B[i*nW+j]);
		printf("\n");
	}
#endif

	CHECK(cudaFree(A_dev));
	CHECK(cudaFree(B_dev));
	CHECK(cudaDeviceReset());
	free(A);
	free(B);
	return 0;
}



In [None]:
# Compilazione ed esecuzione
!nvcc -arch=sm_60  conv2D.cu -o conv2D
!./conv2D

In [None]:
# Profile the binary built in the current directory
!nvprof ./conv2D