**Commands**

In [None]:
# Install the nvcc4jupyter plugin straight from its GitHub repository.
# GitHub no longer serves the unauthenticated git:// protocol, so the
# repository is fetched over HTTPS instead.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
# Load the plugin; the %%cu cells in this notebook rely on it.
# NOTE(review): newer revisions of the plugin may expose a different
# extension or magic name - confirm nvcc_plugin / %%cu match what gets installed.
%load_ext nvcc_plugin


Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-4dl7vel5
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-4dl7vel5
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=eaaf3d998edbbf61a5da2aebebe13db61b6d978e0897c4a78a926f4709a616ae
  Stored in directory: /tmp/pip-ephem-wheel-cache-qb_nlcu1/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


In [None]:
# Print the version banner of the nvcc compiler driver available to this notebook.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


**Hello World Program**

In [None]:
%%cu
#include<stdio.h>
#include<stdlib.h>

// Device kernel: every launched thread prints its own thread index and the
// index of the block it belongs to (x dimension only).
__global__ void print1(void)
{
    const int threadId = threadIdx.x;
    const int blockId  = blockIdx.x;

    printf("Hello World - thread index = %d, block index = %d \n", threadId, blockId);
}

// Host entry point: prints a greeting, launches print1 with 1 block of
// 2 threads and waits for the device to finish.
// Returns 0 on success, EXIT_FAILURE when the launch or the kernel failed.
int main(void)
{
    printf("Hello World from host!\n");

    print1<<<1,2>>>();

    // A kernel launch returns nothing: configuration errors are reported by
    // cudaGetLastError(), errors raised while the kernel runs by the synchronize.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess)
    {
        fprintf(stderr, "print1 kernel failed: %s\n", cudaGetErrorString(status));
        return EXIT_FAILURE;
    }

    return 0;
}


Hello World from host!
Hello World - thread index = 0, block index = 0 
Hello World - thread index = 1, block index = 0 



**Hello World Program - With multiple blocks**

In [None]:
%%cu
#include<stdio.h>
#include<stdlib.h>

// Device kernel: reports the calling thread's index and its block's index.
__global__ void print1(void)
{
    // The built-in index variables are unsigned; convert once for the %d fields.
    const int tid = static_cast<int>(threadIdx.x);
    const int bid = static_cast<int>(blockIdx.x);

    printf("Hello World - thread index = %d, block index = %d \n", tid, bid);
}

// Host entry point: prints a greeting, launches print1 with 2 blocks of
// 2 threads each and waits for the device to finish.
// Returns 0 on success, EXIT_FAILURE when the launch or the kernel failed.
int main(void)
{
    printf("Hello World from host!\n");

    print1<<<2,2>>>();

    // Launch-configuration errors surface through cudaGetLastError(); errors
    // raised during execution are returned by cudaDeviceSynchronize().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess)
    {
        fprintf(stderr, "print1 kernel failed: %s\n", cudaGetErrorString(status));
        return EXIT_FAILURE;
    }

    return 0;
}

Hello World from host!
Hello World - thread index = 0, block index = 0 
Hello World - thread index = 1, block index = 0 
Hello World - thread index = 0, block index = 1 
Hello World - thread index = 1, block index = 1 



**Effect of cudaDeviceSynchronize**

In [None]:
%%cu
#include<stdio.h>
#include<stdlib.h>

// Device kernel: prints one line per thread with its thread and block index.
__global__ void print1(void)
{
    const unsigned int threadInBlock = threadIdx.x;
    const unsigned int blockInGrid   = blockIdx.x;

    printf("Hello World - thread index = %d, block index = %d \n", threadInBlock, blockInGrid);
}

// Host entry point showing how host output interleaves with kernel output.
// The statement order is the point of this cell and must not be rearranged.
int main(void) 
{
    printf("Hello World from host!\n");
    // Kernel launch: 2 blocks of 2 threads each.
    print1<<<2,2>>>();
    // Issued before the synchronize below; in the recorded run this "hi"
    // appears ahead of the lines printed by the kernel.
    printf("hi");
    // Block the host until the device has finished the work queued above.
    cudaDeviceSynchronize();
    return 0;
}

Hello World from host!
hiHello World - thread index = 0, block index = 0 
Hello World - thread index = 1, block index = 0 
Hello World - thread index = 0, block index = 1 
Hello World - thread index = 1, block index = 1 



**Adding two numbers**

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
// Device kernel: stores the sum of the two integers pointed to by a and b
// into the integer pointed to by c.
__global__ void add(int *a, int *b, int *c)
{
  const int lhs = *a;
  const int rhs = *b;

  *c = lhs + rhs;
}

// Terminates the program with a message naming the failed step when a CUDA
// runtime call did not return cudaSuccess; does nothing otherwise.
static void checkCuda(cudaError_t status, const char *step)
{
  if (status != cudaSuccess)
  {
      fprintf(stderr, "CUDA error %s: %s\n", step, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
  }
}

// Host entry point: adds 3 and 5 on the device with a single-thread kernel
// and prints the result. Every CUDA call is checked, so a failure stops the
// program instead of printing a stale value.
int main()
{
  int a = 3, b = 5, c = 0;
  int *d_a = NULL, *d_b = NULL, *d_c = NULL;
  const size_t size = sizeof(int);

  // One int of device storage per operand and for the result.
  checkCuda(cudaMalloc((void **)&d_a, size), "allocating d_a");
  checkCuda(cudaMalloc((void **)&d_b, size), "allocating d_b");
  checkCuda(cudaMalloc((void **)&d_c, size), "allocating d_c");

  checkCuda(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice), "copying a to Device");
  checkCuda(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice), "copying b to Device");

  add<<<1,1>>>(d_a, d_b, d_c);
  // The launch itself returns nothing; configuration errors show up here.
  checkCuda(cudaGetLastError(), "launching add kernel");

  // The blocking copy waits for the kernel and also reports its execution errors.
  checkCuda(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost), "copying to Host");
  printf("result is %d\n",c);

  checkCuda(cudaFree(d_a), "freeing d_a");
  checkCuda(cudaFree(d_b), "freeing d_b");
  checkCuda(cudaFree(d_c), "freeing d_c");

  return 0;
}

result is 8



**Adding two vectors**

In [None]:
%%cu
#include <stdio.h>

// Wraps a CUDA runtime call so that a failure is reported together with the
// source file and line of the call site.
#define HANDLE_ERROR( err ) ( HandleError( err, __FILE__, __LINE__ ) )

// Prints the CUDA error string with the given location and terminates the
// program when err is not cudaSuccess; returns normally otherwise.
static void HandleError( cudaError_t err, const char *file, int line )
{
    if (err == cudaSuccess)
        return;

    const char *description = cudaGetErrorString( err );
    printf( "%s in %s at line %d\n", description, file, line );
    exit( EXIT_FAILURE );
}



// Number of elements in each vector.
const short N = 10;

// Device kernel: element-wise sum dev_c[i] = dev_a[i] + dev_b[i].
// The element index is taken from threadIdx.x only, so the kernel covers
// the vectors when launched as a single block.
__global__ void Vector_Addition ( const int *dev_a , const int *dev_b , int *dev_c)
{
      // Index of this thread within its block
      const unsigned short idx = threadIdx.x;

      // Threads whose index falls outside the vectors have nothing to do
      if ( idx >= N )
            return;

      dev_c[idx] = dev_a[idx] + dev_b[idx];
}


// Host driver: fills two N-element vectors, adds them on the device with one
// block of N threads, prints every sum and finally the host address of N.
// Every CUDA call, including the kernel launch and the frees, goes through
// HANDLE_ERROR so a failure stops the program with file and line.
int main (void)
{
      int Host_a[N], Host_b[N], Host_c[N];

      int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

      HANDLE_ERROR ( cudaMalloc((void **)&dev_a , N*sizeof(int) ) );
      HANDLE_ERROR ( cudaMalloc((void **)&dev_b , N*sizeof(int) ) );
      HANDLE_ERROR ( cudaMalloc((void **)&dev_c , N*sizeof(int) ) );

      for ( int i = 0; i < N; i++ )
      {
            Host_a[i] = -i ;
            Host_b[i] = i*i ;
      }

      HANDLE_ERROR (cudaMemcpy (dev_a , Host_a , N*sizeof(int) , cudaMemcpyHostToDevice));
      HANDLE_ERROR (cudaMemcpy (dev_b , Host_b , N*sizeof(int) , cudaMemcpyHostToDevice));

      Vector_Addition <<< 1, N  >>> (dev_a , dev_b , dev_c ) ;
      // A launch returns nothing; configuration errors are reported here.
      HANDLE_ERROR ( cudaGetLastError() );

      // The blocking copy waits for the kernel and also reports errors raised
      // while it was running.
      HANDLE_ERROR (cudaMemcpy(Host_c , dev_c , N*sizeof(int) , cudaMemcpyDeviceToHost));

      for ( int i = 0; i < N; i++ )
            printf ("%d + %d = %d\n", Host_a[i] , Host_b[i] , Host_c[i] ) ;

      HANDLE_ERROR ( cudaFree (dev_a) );
      HANDLE_ERROR ( cudaFree (dev_b) );
      HANDLE_ERROR ( cudaFree (dev_c) );

      // Host-side address of N; %p expects a pointer to void.
      printf("%p \n", (const void *)&N);

      return 0 ;

}


0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72
0x55a450f7803e 



**Adding two vectors using multiple blocks**

In [None]:
%%cu
/**
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/**
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */

#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

/**
 * CUDA Kernel Device code
 *
 * Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < numElements.
 * Each thread handles the single element whose index equals its global
 * position in a 1D grid; threads positioned past numElements do nothing.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    const int element = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so trailing threads may have no element.
    if (element >= numElements)
    {
        return;
    }

    C[element] = A[element] + B[element];
}

/**
 * Terminates the program when a CUDA runtime call reports a failure.
 *
 * failureFormat is a printf-style message containing exactly one %s, which
 * receives the CUDA error string; it is written to stderr before exiting.
 */
static void
exitOnCudaFailure(cudaError_t status, const char *failureFormat)
{
    if (status == cudaSuccess)
    {
        return;
    }

    fprintf(stderr, failureFormat, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}

/**
 * Host main routine
 *
 * Fills two host vectors with random values, adds them on the device,
 * copies the result back, verifies it against a host-side sum and releases
 * all resources. Any failure prints a message to stderr and exits with
 * EXIT_FAILURE.
 */
int
main(void)
{
    // Vector length and the matching buffer size in bytes
    const int numElements = 50000;
    const size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Host buffers: inputs A and B, output C
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    if (!h_A || !h_B || !h_C)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Random inputs in [0, 1]
    for (int idx = 0; idx < numElements; ++idx)
    {
        h_A[idx] = rand()/(float)RAND_MAX;
        h_B[idx] = rand()/(float)RAND_MAX;
    }

    // Device buffers mirroring the three host vectors
    float *d_A = NULL;
    exitOnCudaFailure(cudaMalloc((void **)&d_A, size),
                      "Failed to allocate device vector A (error code %s)!\n");

    float *d_B = NULL;
    exitOnCudaFailure(cudaMalloc((void **)&d_B, size),
                      "Failed to allocate device vector B (error code %s)!\n");

    float *d_C = NULL;
    exitOnCudaFailure(cudaMalloc((void **)&d_C, size),
                      "Failed to allocate device vector C (error code %s)!\n");

    // Upload both inputs
    printf("Copy input data from the host memory to the CUDA device\n");
    exitOnCudaFailure(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                      "Failed to copy vector A from host to device (error code %s)!\n");
    exitOnCudaFailure(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
                      "Failed to copy vector B from host to device (error code %s)!\n");

    // Launch with enough 256-thread blocks to cover every element (ceiling division)
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    exitOnCudaFailure(cudaGetLastError(),
                      "Failed to launch vectorAdd kernel (error code %s)!\n");

    // Download the result
    printf("Copy output data from the CUDA device to the host memory\n");
    exitOnCudaFailure(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
                      "Failed to copy vector C from device to host (error code %s)!\n");

    // Compare each device result with the sum computed on the host
    for (int idx = 0; idx < numElements; ++idx)
    {
        if (fabs(h_A[idx] + h_B[idx] - h_C[idx]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", idx);
            exit(EXIT_FAILURE);
        }
    }

    // Release device memory
    exitOnCudaFailure(cudaFree(d_A),
                      "Failed to free device vector A (error code %s)!\n");
    exitOnCudaFailure(cudaFree(d_B),
                      "Failed to free device vector B (error code %s)!\n");
    exitOnCudaFailure(cudaFree(d_C),
                      "Failed to free device vector C (error code %s)!\n");

    // Release host memory
    free(h_A);
    free(h_B);
    free(h_C);

    // Reset the device before leaving
    exitOnCudaFailure(cudaDeviceReset(),
                      "Failed to deinitialize the device! error=%s\n");

    printf("Done\n");
    return 0;
}


[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
Copy output data from the CUDA device to the host memory
Done



Unified Memory / Identify the mistake

In [None]:
%%cu
#include <stdio.h>
// Managed (unified memory) variable, referenced by both host and device code.
__managed__ int x;


// Device kernel: prints the value of x as seen on the GPU, then overwrites it.
__global__ void GPU_func( )
{
   const int seenOnDevice = x;
   printf("GPU sees x = %d\n", seenOnDevice);

   x = 4444;
}

// Host entry point of the unified-memory example.
// NOTE(review): the heading above this cell asks the reader to identify a
// mistake, so the statement order below is deliberately left untouched.
int main()
{
   // Host writes the managed variable before the launch.
   x = 1234;

   GPU_func<<< 1, 1 >>>( );   // Start GPU function

   // NOTE(review): this host read of x is issued before the synchronize on the
   // next line - confirm the recorded output was produced by this exact order.
   printf("CPU sees x = %d\n", x);
   cudaDeviceSynchronize();   



   return 0;
}

GPU sees x = 1234
CPU sees x = 4444



In [None]:
%%cu
#include <stdio.h>




// Number of elements in each vector.
const short N = 10 ;

// Managed arrays: the same storage is addressed from host code and from kernels.
__managed__ int a[N], b[N], c[N];

// Device kernel: element-wise sum dev_c[i] = dev_a[i] + dev_b[i] over the
// vectors supplied by the caller. The kernel works through its parameters
// rather than reaching for the global arrays, so it adds whatever it is given.
// The element index is taken from threadIdx.x only (single-block launch).
__global__ void Vector_Addition ( const int *dev_a , const int *dev_b , int *dev_c)
{
      //Get the id of thread within a block
      unsigned short tid = threadIdx.x ;

      // check the boundary condition for the threads
      if ( tid < N )
            dev_c [tid] = dev_a[tid] + dev_b[tid] ;

}


// Host driver: fills the managed input arrays, adds them on the device with
// one block of N threads and prints every sum.
// Returns 0 on success, 1 when the launch or the kernel failed; results are
// only read once the device has finished successfully.
int main (void)
{

      for ( int i = 0; i < N; i++ )
      {
            a[i] = -i ;
            b[i] = i*i ;
      }

      Vector_Addition <<< 1, N  >>> (a , b , c ) ;

      // Launch-configuration errors are reported by cudaGetLastError(); errors
      // raised while the kernel runs are returned by the synchronize.
      cudaError_t status = cudaGetLastError();
      if ( status == cudaSuccess )
            status = cudaDeviceSynchronize();

      if ( status != cudaSuccess )
      {
            fprintf ( stderr, "Vector_Addition failed: %s\n", cudaGetErrorString(status) );
            return 1 ;
      }

      for ( int i = 0; i < N; i++ )
            printf ("%d + %d = %d\n", a[i] , b[i] , c[i] ) ;

      return 0 ;

}


0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72



Without passing arguments to device

In [None]:
%%cu
#include <stdio.h>




// Number of elements in each vector.
const short N = 10;

// Managed arrays referenced directly by the kernel and by host code.
__managed__ int a[N], b[N], c[N];

// Device kernel: element-wise sum c[i] = a[i] + b[i] on the managed arrays
// declared above; takes no arguments. The element index is threadIdx.x only.
__global__ void Vector_Addition ()
{
      // Index of this thread within its block
      const unsigned short idx = threadIdx.x;

      // Threads whose index falls outside the arrays have nothing to do
      if ( idx >= N )
            return;

      c[idx] = a[idx] + b[idx];
}


// Host driver: fills the managed input arrays, runs the argument-less kernel
// with one block of N threads and prints every sum.
// Returns 0 on success, 1 when the launch or the kernel failed; results are
// only read once the device has finished successfully.
int main (void)
{

      for ( int i = 0; i < N; i++ )
      {
            a[i] = -i ;
            b[i] = i*i ;
      }

      Vector_Addition <<< 1, N  >>> ( ) ;

      // Launch-configuration errors are reported by cudaGetLastError(); errors
      // raised while the kernel runs are returned by the synchronize.
      cudaError_t status = cudaGetLastError();
      if ( status == cudaSuccess )
            status = cudaDeviceSynchronize();

      if ( status != cudaSuccess )
      {
            fprintf ( stderr, "Vector_Addition failed: %s\n", cudaGetErrorString(status) );
            return 1 ;
      }

      for ( int i = 0; i < N; i++ )
            printf ("%d + %d = %d\n", a[i] , b[i] , c[i] ) ;

      return 0 ;

}


0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72

