>Refresh the cloud instance: remove any existing CUDA installation from the server

In [None]:
# Remove every existing CUDA / NVIDIA package so a later install starts clean.
# -y answers apt's confirmation prompt, which a notebook cell cannot do interactively.
# The patterns are quoted so the shell hands them to apt instead of expanding
# them against files in the working directory.
!apt-get --purge remove -y cuda 'nvidia*' 'libnvidia-*'
# Purge any leftover cuda-* packages; -r keeps xargs from invoking dpkg with no arguments.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -r -n1 dpkg --purge
!apt-get remove -y 'cuda-*'
!apt autoremove -y
!apt-get update



>Install CUDA version 9.2

In [None]:
# Download the CUDA 9.2 local-repository package for Ubuntu 16.04 and save it with a .deb suffix.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Register the local repository and trust its signing key.
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
# -y: a notebook cell cannot answer apt's confirmation prompt, so without it the install aborts.
!apt-get install -y cuda-9.2




>Check the installed CUDA version

In [None]:
# Print the nvcc release to confirm which toolkit is on PATH (-V is the short form of --version).
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Wed_Apr_11_23:16:29_CDT_2018
Cuda compilation tools, release 9.2, V9.2.88


>Run the following command to install a small extension that lets notebook cells run nvcc

In [None]:
# GitHub no longer serves the unauthenticated git:// transport, so clone over HTTPS instead.
# NOTE(review): this installs the repository's current default branch; if `%load_ext nvcc_plugin`
# fails afterwards, pin a revision that still ships that module -- TODO confirm.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-n5hzf6_y
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-n5hzf6_y
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=bfab558f624a5f3d4bbab6352ca39f557de1e9a7e3db34be7cab4748c4657f36
  Stored in directory: /tmp/pip-ephem-wheel-cache-bcar73kc/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin


>Load the extension using this code:

In [None]:
# Register the nvcc extension with this kernel; the %%cu cells further down depend on it.
%load_ext nvcc_plugin

directory /content/src already exists
Out bin /content/result.out


>To check that the setup works, run the following snippet in a notebook cell:

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>

// Device kernel: reads the two ints that a and b point to and stores their sum through c.
__global__ void add(int *a, int *b, int *c)
{
    const int lhs = *a;
    const int rhs = *b;
    *c = lhs + rhs;
}

// Print a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is anything other than cudaSuccess.
static bool cudaFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return false;
    }
    printf("CUDA error %s: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Adds 3 + 5 on the GPU with a single-block, single-thread launch of add()
// and prints the sum. Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    // host copies of variables a, b & c
    int a = 3, b = 5, c = 0;

    // device copies of variables a, b & c; NULL so the cleanup at the end
    // is safe no matter which step failed
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;

    const size_t size = sizeof(int);

    // Allocate space for the device copies and upload the inputs.
    // The && chain stops at the first failing call.
    bool ok = !cudaFailed(cudaMalloc((void **)&d_a, size), "allocating d_a")
           && !cudaFailed(cudaMalloc((void **)&d_b, size), "allocating d_b")
           && !cudaFailed(cudaMalloc((void **)&d_c, size), "allocating d_c")
           && !cudaFailed(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice), "copying a to Device")
           && !cudaFailed(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice), "copying b to Device");

    if (ok)
    {
        // Launch add() kernel on GPU
        add<<<1,1>>>(d_a, d_b, d_c);
        // The launch statement yields no status; ask the runtime whether it was rejected.
        ok = !cudaFailed(cudaGetLastError(), "launching add kernel");
    }

    if (ok)
    {
        // Copy result back to host
        ok = !cudaFailed(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost), "copying to Host");
    }

    if (ok)
    {
        // Only report a result that was actually computed and copied back.
        printf("result is %d\n", c);
    }

    // Cleanup (cudaFree accepts a NULL pointer)
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return ok ? 0 : 1;
}

result is 8



Vector Addition implementation (sequential host version, no GPU kernel)



In [None]:
%%cu
#include "stdio.h"
#define N 10

// Host-side element-wise sum: writes a[i] + b[i] into c[i] for the first N elements.
void add(int *a, int *b, int *c)
{
    for (int idx = 0; idx < N; ++idx)
    {
        c[idx] = a[idx] + b[idx];
    }
}

// Host-only driver: fills a with 0..N-1 and b with ones, sums them with add(),
// and prints one "a + b = c" line per element.
int main()
{
    int a[N], b[N], c[N];

    // Fill Arrays
    for (int i = 0; i < N; i++)
    {
        a[i] = i;  // separate statement; the original chained it to the next line with a comma operator
        b[i] = 1;
    }

    add(a, b, c);

    for (int i = 0; i < N; i++)
    {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }
    return 0;
}

0 + 1 = 1
1 + 1 = 2
2 + 1 = 3
3 + 1 = 4
4 + 1 = 5
5 + 1 = 6
6 + 1 = 7
7 + 1 = 8
8 + 1 = 9
9 + 1 = 10



2-D Array Addition

In [None]:
%%cu
#include "stdio.h"
#define COLUMNS 3
#define ROWS 2

// Device kernel: element-wise sum of two ROWS x COLUMNS int matrices laid out row-major.
// Each block handles one element: blockIdx.x selects the column, blockIdx.y the row.
__global__ void add(int *a, int *b, int *c)
{
    int x = blockIdx.x;
    int y = blockIdx.y;

    // Skip blocks that fall outside the matrix so a launch with a grid larger
    // than COLUMNS x ROWS cannot index past the end of the buffers.
    if (x >= COLUMNS || y >= ROWS)
    {
        return;
    }

    int i = (COLUMNS * y) + x;
    c[i] = a[i] + b[i];
}

// Print a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is anything other than cudaSuccess.
static bool cudaFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return false;
    }
    printf("CUDA error %s: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Adds two ROWS x COLUMNS matrices on the GPU (one block per element) and prints
// the result row by row. Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    int a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS];
    // NULL so the cleanup at the end is safe no matter which step failed
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = ROWS * COLUMNS * sizeof(int);

    // Fill Arrays: a holds the column index, b the row index
    for (int y = 0; y < ROWS; y++)
    {
        for (int x = 0; x < COLUMNS; x++)
        {
            a[y][x] = x;
            b[y][x] = y;
        }
    }

    // Allocate the device buffers and upload both inputs.
    // The && chain stops at the first failing call.
    bool ok = !cudaFailed(cudaMalloc((void **) &dev_a, bytes), "allocating dev_a")
           && !cudaFailed(cudaMalloc((void **) &dev_b, bytes), "allocating dev_b")
           && !cudaFailed(cudaMalloc((void **) &dev_c, bytes), "allocating dev_c")
           && !cudaFailed(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "copying a to device")
           && !cudaFailed(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "copying b to device");

    if (ok)
    {
        // COLUMNS x ROWS grid of single-thread blocks, one per matrix element
        dim3 grid(COLUMNS,ROWS);
        add<<<grid,1>>>(dev_a, dev_b, dev_c);
        // The launch statement yields no status; ask the runtime whether it was rejected.
        ok = !cudaFailed(cudaGetLastError(), "launching add kernel");
    }

    if (ok)
    {
        ok = !cudaFailed(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "copying result to host");
    }

    if (ok)
    {
        for (int y = 0; y < ROWS; y++) // Output Arrays
        {
            for (int x = 0; x < COLUMNS; x++)
            {
                printf("[%d][%d]=%d ",y,x,c[y][x]);
            }
            printf("\n");
        }
    }

    // Release the device buffers (the original never freed them; cudaFree accepts NULL)
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return ok ? 0 : 1;
}

[0][0]=0 [0][1]=1 [0][2]=2 
[1][0]=1 [1][1]=2 [1][2]=3 

