In [None]:
# Remove any CUDA toolkit and NVIDIA driver packages that are already installed.
# The package patterns are quoted so the shell passes them to apt unexpanded
# instead of globbing them against files in the working directory, and -y keeps
# the commands from stopping at a confirmation prompt.
!apt-get --purge remove -y cuda 'nvidia*' 'libnvidia-*'
# Purge every remaining package whose dpkg listing contains "cuda-";
# xargs -r skips dpkg entirely when the list is empty.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -r -n1 dpkg --purge
!apt-get remove -y 'cuda-*'
!apt autoremove -y
!apt-get update

In [None]:
# Download the CUDA 9.2 local-repository package, register it together with its
# signing key, refresh the package index and install the toolkit.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
# The metapackage name uses dashes (cuda-9-2); -y answers the install prompt.
!apt-get install -y cuda-9-2

In [4]:
# Print the version banner of the nvcc compiler driver found on PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0


In [1]:
# Install the nvcc4jupyter plugin straight from GitHub. The repository is
# fetched over HTTPS because GitHub no longer serves the unauthenticated
# git:// protocol.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-wjpi0j4f
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-wjpi0j4f
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=492f6465d3ba4a26becce94cc15b10e0d497c355c7bc0b72d48ea2d00dd16cd5
  Stored in directory: /tmp/pip-ephem-wheel-cache-_wee2h45/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
# Load the nvcc_plugin IPython extension. Presumably this is what registers the
# %%cu cell magic used by the CUDA cell below -- confirm against the plugin docs.
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [4]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#define ARR_LEN 12

// Element-wise integer vector addition: c[idx] = a[idx] + b[idx].
//
// Each thread derives one index from its 1D block/thread coordinates and
// handles at most that single element; threads whose index is n or larger
// return without touching memory.
__global__ void add(int *a, int *b, int *c, int n) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;

  // Guard clause for the surplus threads of the launch.
  if (idx >= n) {
    return;
  }

  c[idx] = b[idx] + a[idx];
}

// Evaluates a CUDA runtime call and aborts with file/line and the runtime's
// error string when it does not return cudaSuccess.
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    cudaError_t status_ = (call);                                         \
    if (status_ != cudaSuccess) {                                         \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,    \
              cudaGetErrorString(status_));                               \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)

// Runs add() once with the given launch configuration and copies the n
// results from d_c into h_out.
static void runAdd(int blocks, int threads, int *d_a, int *d_b, int *d_c,
                   int *h_out, int n) {
  const size_t bytes = (size_t)n * sizeof(int);

  // Clear the output buffer first so a launch that fails or never runs cannot
  // pass off the previous run's sums as its own result.
  CUDA_CHECK(cudaMemset(d_c, 0, bytes));

  add<<<blocks, threads>>>(d_a, d_b, d_c, n);
  // A kernel launch returns nothing; an invalid configuration is only
  // reported through cudaGetLastError().
  CUDA_CHECK(cudaGetLastError());

  // Blocking copy: waits for the kernel and reports any execution error.
  CUDA_CHECK(cudaMemcpy(h_out, d_c, bytes, cudaMemcpyDeviceToHost));
}

// Prints label followed by the n values, each as "<value>, ".
static void printResult(const char *label, const int *values, int n) {
  printf("%s", label);
  for (int i = 0; i < n; i++)
    printf("%d, ", values[i]);
}

// Adds two ARR_LEN-element vectors on the GPU with three different launch
// configurations and prints the result of each run. Returns 0 on success;
// any failing CUDA call terminates the program with EXIT_FAILURE.
int main() {
  // Host copies of the inputs a and b
  int a[ARR_LEN] = {1,2,3,4,5,6,7,8,9,10,11,12};
  int b[ARR_LEN] = {1,2,3,4,5,6,7,8,9,10,11,12};

  // Separate arrays for the results of the 3 different kernel calls
  int c1[ARR_LEN];
  int c2[ARR_LEN];
  int c3[ARR_LEN];

  // Device copies of a, b and the result c
  int *d_a = NULL, *d_b = NULL, *d_c = NULL;

  const size_t size = ARR_LEN * sizeof(int);

  // Allocate space for device copies of a, b, c
  CUDA_CHECK(cudaMalloc((void **)&d_a, size));
  CUDA_CHECK(cudaMalloc((void **)&d_b, size));
  CUDA_CHECK(cudaMalloc((void **)&d_c, size));

  // Copy inputs to device
  CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

  // 1a) Grid size as N: N blocks of one thread each
  runAdd(ARR_LEN, 1, d_a, d_b, d_c, c1, ARR_LEN);

  // 1b) N threads within a single block
  runAdd(1, ARR_LEN, d_a, d_b, d_c, c2, ARR_LEN);

  // 1c) Keep the number of threads per block at 256 and vary the number of
  // blocks to cover N elements. The block count is an integer ceiling
  // division; ceil(ARR_LEN/256) would truncate to 0 before ceil() is applied
  // and request an invalid zero-block grid.
  const int threadsPerBlock = 256;
  const int numBlocks = (ARR_LEN + threadsPerBlock - 1) / threadsPerBlock;
  runAdd(numBlocks, threadsPerBlock, d_a, d_b, d_c, c3, ARR_LEN);

  printResult("\na) Kernel 1: ", c1, ARR_LEN);
  printResult("\nb) Kernel 2: ", c2, ARR_LEN);
  printResult("\nc) Kernel 3: ", c3, ARR_LEN);

  // Cleanup
  CUDA_CHECK(cudaFree(d_a));
  CUDA_CHECK(cudaFree(d_b));
  CUDA_CHECK(cudaFree(d_c));
  return 0;
}


a) Kernel 1: 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 
b) Kernel 2: 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 
c) Kernel 3: 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 
