In [None]:
# Remove any preinstalled CUDA / NVIDIA packages before installing another toolkit.
# -y answers apt's confirmation prompt (a notebook cell cannot answer it);
# the globs are quoted so apt matches them instead of the shell expanding them
# against files in the working directory.
!apt-get -y --purge remove cuda 'nvidia*' 'libnvidia-*'
# Purge leftover cuda-* packages one at a time; -r skips dpkg when nothing matches.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -r -n1 dpkg --purge
!apt-get -y remove 'cuda-*'
!apt -y autoremove
!apt-get update

In [None]:
# Download the CUDA 9.2 local repository package, register it with apt and install the toolkit.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
# -y: a notebook cell cannot answer apt's "Do you want to continue?" prompt.
!apt-get -y install cuda-9.2

In [7]:
# Print the version of the nvcc compiler driver that is on PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0


In [8]:
# Install the nvcc4jupyter plugin straight from GitHub.
# Use https: GitHub no longer serves the unauthenticated git:// protocol.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-6qb_tobx
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-6qb_tobx
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=86624ed6e0cbc6b46278abf7540c207a99c4f39f16b19c00bac38d422201fff5
  Stored in directory: /tmp/pip-ephem-wheel-cache-5n4153de/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin


In [15]:
# Load the nvcc_plugin IPython extension (presumably what provides the %%cu
# cell magic used by the CUDA cell -- confirm against the nvcc4jupyter docs).
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [22]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/* Q1: Write CUDA kernels for matrix multiplication:
* a) Each row of the resultant matrix is computed by one thread
* b) Each column of the resultant matrix is computed by one thread
* c) Each element of the resultant matrix is computed by one thread
*/
/*
 * Matrix product with one thread per output row: prod = a * b.
 *
 * a    : M x N input, row-major, device-accessible
 * b    : P x Q input, row-major, device-accessible
 * prod : M x Q output, row-major, device-accessible
 *
 * Launch layout: 1-D grid of 1-D blocks with at least M threads in total.
 * Threads whose global index is >= M do nothing, so the launch size does not
 * have to match M exactly. If N != P the kernel returns without writing prod.
 */
__global__ void matMultRow(int *a, int *b, int *prod, int M, int N, int P, int Q){
  // Global thread index == output row handled by this thread.
  int row = blockIdx.x * blockDim.x + threadIdx.x;

  // Inner dimensions must agree, and surplus threads must not touch memory.
  if(N != P || row >= M)
    return;

  for(int j = 0; j < Q; j++){
    int sum = 0;
    for(int k = 0; k < N; k++)
      sum += a[row * N + k] * b[k * Q + j];

    prod[row * Q + j] = sum;
  }
}

/*
 * Matrix product with one thread per output column: prod = a * b.
 *
 * a    : M x N input, row-major, device-accessible
 * b    : P x Q input, row-major, device-accessible
 * prod : M x Q output, row-major, device-accessible
 *
 * Launch layout: 1-D grid of 1-D blocks with at least Q threads in total.
 * Threads whose global index is >= Q do nothing, so the launch size does not
 * have to match Q exactly. If N != P the kernel returns without writing prod.
 */
__global__ void matMultCol(int *a, int *b, int *prod, int M, int N, int P, int Q){
  // Global thread index == output column handled by this thread.
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  // Inner dimensions must agree, and surplus threads must not touch memory.
  if(N != P || col >= Q)
    return;

  for(int i = 0; i < M; i++){
    int sum = 0;
    for(int k = 0; k < N; k++)
      sum += a[i * N + k] * b[k * Q + col];

    prod[i * Q + col] = sum;
  }
}

/*
 * Matrix product with one thread per output element: prod = a * b.
 *
 * a    : M x N input, row-major, device-accessible
 * b    : P x Q input, row-major, device-accessible
 * prod : M x Q output, row-major, device-accessible
 *
 * Launch layout: 2-D blocks (optionally a 2-D grid) where the x index selects
 * the output row and the y index selects the output column; at least M x Q
 * threads in total. Threads outside the M x Q result do nothing. If N != P
 * the kernel returns without writing prod.
 */
__global__ void matMultElement(int *a, int *b, int *prod, int M, int N, int P, int Q){
  int row = blockIdx.x * blockDim.x + threadIdx.x;
  int col = blockIdx.y * blockDim.y + threadIdx.y;

  // Inner dimensions must agree, and surplus threads must not touch memory.
  if(N != P || row >= M || col >= Q)
    return;

  int sum = 0;
  for(int k = 0; k < N; k++)
    sum += a[row * N + k] * b[k * Q + col];

  // prod has Q columns, so its row stride is Q (not N).
  prod[row * Q + col] = sum;
}

// Abort with a readable message if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what){
  if(status != cudaSuccess){
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Print a row-major rows x cols matrix, one row per line.
static void printMatrix(const int *m, int rows, int cols){
  for(int i = 0; i < rows; i++){
    for(int j = 0; j < cols; j++)
      printf("%d ", m[i * cols + j]);
    printf("\n");
  }
}

/*
 * Multiplies a fixed 1x2 matrix by a fixed 2x3 matrix on the GPU three times
 * (one thread per row, per column, and per element) and prints the inputs and
 * the three results. Every CUDA call and kernel launch is checked; the first
 * failure is reported on stderr and terminates the program.
 */
int main() {
  // const so the arrays below are ordinary fixed-size arrays, not VLAs.
  const int M = 1, N = 2, P = 2, Q = 3;

  // host copies of matrices a, b
  int a[M][N] = {{1,2}};
  int b[P][Q] = {{1,2,3},{4,5,6}};

  // Separate arrays for the results of the 3 different kernel calls
  int prod1[M][Q];
  int prod2[M][Q];
  int prod3[M][Q];

  // device copies of a, b & prod
  int *d_a = NULL, *d_b = NULL, *d_prod = NULL;
  size_t sizeA = M * N * sizeof(int);
  size_t sizeB = P * Q * sizeof(int);
  size_t sizeProd = M * Q * sizeof(int);

  // Allocate space for device copies of a, b, prod
  checkCuda(cudaMalloc((void **)&d_a, sizeA), "allocating a");
  checkCuda(cudaMalloc((void **)&d_b, sizeB), "allocating b");
  checkCuda(cudaMalloc((void **)&d_prod, sizeProd), "allocating prod");

  // Copy inputs to device
  checkCuda(cudaMemcpy(d_a, a, sizeA, cudaMemcpyHostToDevice), "copying a to device");
  checkCuda(cudaMemcpy(d_b, b, sizeB, cudaMemcpyHostToDevice), "copying b to device");

  // Launch kernels on GPU. cudaGetLastError() reports a bad launch
  // configuration; the copy back that follows each launch is also checked.

  // a) A thread for each row
  matMultRow<<<1, M>>>(d_a, d_b, d_prod, M, N, P, Q);
  checkCuda(cudaGetLastError(), "launching matMultRow");
  checkCuda(cudaMemcpy(prod1, d_prod, sizeProd, cudaMemcpyDeviceToHost), "copying row result to host");

  // b) A thread for each col
  matMultCol<<<1, Q>>>(d_a, d_b, d_prod, M, N, P, Q);
  checkCuda(cudaGetLastError(), "launching matMultCol");
  checkCuda(cudaMemcpy(prod2, d_prod, sizeProd, cudaMemcpyDeviceToHost), "copying col result to host");

  // c) A thread for each element
  dim3 dimBlock(M,Q,1);
  matMultElement<<<1, dimBlock>>>(d_a, d_b, d_prod, M, N, P, Q);
  checkCuda(cudaGetLastError(), "launching matMultElement");
  checkCuda(cudaMemcpy(prod3, d_prod, sizeProd, cudaMemcpyDeviceToHost), "copying element result to host");

  printf("Matrix A (MxN): %d x %d\n",M,N);
  printMatrix(&a[0][0], M, N);

  printf("\nMatrix B (PxQ): %d x %d\n",P,Q);
  printMatrix(&b[0][0], P, Q);

  printf("\nOne thread per row:\n");
  printMatrix(&prod1[0][0], M, Q);

  printf("\nOne thread per col:\n");
  printMatrix(&prod2[0][0], M, Q);

  printf("\nOne thread per element:\n");
  printMatrix(&prod3[0][0], M, Q);

  // Cleanup
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_prod);
  return 0;
}

Matrix A (MxN): 1 x 2
1 2 

Matrix B (PxQ): 2 x 3
1 2 3 
4 5 6 

One thread per row:
9 12 15 

One thread per col:
9 12 15 

One thread per element:
9 12 15 

