Step 1 - Check the presence and version of the NVIDIA CUDA Compiler (nvcc)

In [None]:
# Print the version banner of the nvcc compiler driver found on PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


Step 2 - Install the nvcc plugin

In [None]:
# Pinned to the commit this notebook was last run against (see the "Resolved ... to commit"
# line in the recorded output) so that a fresh run installs the same NVCCPlugin build that
# provides the `nvcc_plugin` extension and the `%%cu` magic used below.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install git+https://github.com/andreinechaev/nvcc4jupyter.git@0a71d56e5dce3ff1f0dd2c47c29367629262f527

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-90uph1_z
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-90uph1_z
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=2e363c4ef2fa4ff413d0422216c20ffed57dbbb102d8f689c7db9b1f31234feb
  Stored in directory: /tmp/pip-ephem-wheel-cache-ot1un4ut/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


Step 3 - Load the nvcc plugin

In [None]:
# Load the IPython extension installed in Step 2; the %%cu cell magic in Step 5 needs it.
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


Step 4 - Attach Google Drive to save the text file with results

In [None]:
# Mount Google Drive at /content/gdrive; the CUDA program in Step 5 writes its
# result file under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


Step 5 - Run CUDA code

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <time.h>
#include <math.h>

// Edge length of the square thread block and of the shared-memory tiles used by the kernel.
#define BLOCK_SIZE 16
// Matrix dimension: every matrix is WIDTH x WIDTH, stored row-major.
#define WIDTH 100
// Total number of elements per matrix. Derived from WIDTH so the host buffers
// (sized with N) can never disagree with the dimension passed to the kernel.
#define N (WIDTH * WIDTH)

/*
 * Tiled multiplication of two square integer matrices: d_c = d_a * d_b.
 *
 * d_a, d_b : input matrices, n x n, row-major, in device memory
 * d_c      : output matrix, n x n, row-major, in device memory
 * n        : matrix dimension (does not have to be a multiple of BLOCK_SIZE)
 *
 * Each thread computes one element of d_c. The tile indexing below uses
 * BLOCK_SIZE, so the kernel must be launched with
 * BLOCK_SIZE x BLOCK_SIZE thread blocks and a grid that covers n x n.
 */
__global__ void gpu_square_matrix_mult(int *d_a, int *d_b, int *d_c, int n)
{
	// Tiles use the same element type as the matrices. Staging the values as
	// float would make the multiply-accumulate below run in single precision,
	// which cannot represent partial sums above 2^24 exactly.
	__shared__ int tile_a[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ int tile_b[BLOCK_SIZE][BLOCK_SIZE];

	int tx = threadIdx.x;
	int ty = threadIdx.y;

	// coordinates of the output element owned by this thread
	int row = blockIdx.y * BLOCK_SIZE + ty;
	int column = blockIdx.x * BLOCK_SIZE + tx;

	int tmp = 0;

	// number of tiles needed to span one row/column (integer ceiling division)
	int num_tiles = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

	for (int i = 0; i < num_tiles; i++) {
		int a_col = i * BLOCK_SIZE + tx;
		int b_row = i * BLOCK_SIZE + ty;

		// load tile a from Mat a; out-of-range positions contribute 0
		tile_a[ty][tx] = (row < n && a_col < n) ? d_a[row * n + a_col] : 0;

		// load tile b from Mat b; out-of-range positions contribute 0
		tile_b[ty][tx] = (column < n && b_row < n) ? d_b[b_row * n + column] : 0;

		// after the entire tile's values are available, proceed
		__syncthreads();

		for (int j = 0; j < BLOCK_SIZE; j++)
			tmp += tile_a[ty][j] * tile_b[j][tx];

		// after the entire tile's values have been used, proceed
		// (the next iteration overwrites both tiles)
		__syncthreads();
	}

	// boundary check: threads outside the matrix only helped with loading
	if (row < n && column < n)
		d_c[row * n + column] = tmp;
}

/*
 * Fill arr[0 .. size-1] so that element i holds (i + 1) % 1000,
 * i.e. the repeating sequence 1, 2, ..., 999, 0, 1, ...
 * Writes nothing when size <= 0.
 */
void populate(int *arr, int size){
    int idx = 0;
    while (idx < size) {
        int next = idx + 1;
        arr[idx] = next % 1000;
        idx = next;
    }
}

/*
 * Host driver: fills two WIDTH x WIDTH integer matrices, multiplies them on
 * the GPU with gpu_square_matrix_mult, prints the elapsed wall-clock time in
 * milliseconds and writes the product to a text file on the mounted Drive
 * (one matrix row per line).
 *
 * Returns EXIT_SUCCESS when the result file was written, EXIT_FAILURE if any
 * allocation, CUDA call or file operation failed.
 */
int main() {
    // host copies of variables a, b & c
    int *a = NULL, *b = NULL, *c = NULL;

    // device copies of variables a, b & c
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;

    // size in bytes of one matrix (N elements)
    size_t size = N * sizeof(int);

    struct timespec start, end;
    double time_taken = 0.0;
    int status = EXIT_SUCCESS;

    // Allocate space for device copies of a, b, c
    cudaError_t err = cudaMalloc((void **)&d_a, size);
    if (err == cudaSuccess) err = cudaMalloc((void **)&d_b, size);
    if (err == cudaSuccess) err = cudaMalloc((void **)&d_c, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error allocating device memory: %s\n",
                cudaGetErrorString(err));
        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
        return EXIT_FAILURE;
    }

    // Allocate space for host copies of a, b, c
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    c = (int *)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(a); free(b); free(c);
        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
        return EXIT_FAILURE;
    }

    clock_gettime(CLOCK_MONOTONIC, &start);

    // Initialize input values
    populate(a, N);
    populate(b, N);

    // Initialize result
    for (int i = 0; i < N; i++) {
        c[i] = 0;
    }

    // Copy inputs to device; remember which step failed for the error report
    const char *failed_step = NULL;
    err = cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_c, c, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) failed_step = "copying to Device";

    if (err == cudaSuccess) {
        // One BLOCK_SIZE x BLOCK_SIZE block per output tile; integer ceiling
        // division so a partial last tile is still covered.
        unsigned int tiles = (WIDTH + BLOCK_SIZE - 1) / BLOCK_SIZE;
        dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE, 1);
        dim3 gridSize(tiles, tiles, 1);

        // Launch matrix multiplication kernel on GPU
        gpu_square_matrix_mult<<<gridSize, blockSize>>>(d_a, d_b, d_c, WIDTH);

        // Launch-configuration errors are only reported through cudaGetLastError
        err = cudaGetLastError();
        if (err != cudaSuccess) failed_step = "launching kernel";
    }

    if (err == cudaSuccess) {
        // Copy result back to host. This blocking copy also waits for the
        // kernel and returns any error raised while it was executing.
        err = cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) failed_step = "copying to Host";
    }

    clock_gettime(CLOCK_MONOTONIC, &end);
    // Convert both components to milliseconds before adding them
    time_taken = (end.tv_sec - start.tv_sec) * 1000.0
               + (end.tv_nsec - start.tv_nsec) / 1000000.0;
    printf("Total time taken (milliseconds): %.8f\n", time_taken);

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error %s: %s\n", failed_step,
                cudaGetErrorString(err));
        status = EXIT_FAILURE;
    }
    else {
        FILE *fp = fopen("/content/gdrive/My Drive/output_gpu.txt", "w");
        if (fp == NULL) {
            // e.g. Drive is not mounted; avoid writing through a NULL stream
            perror("Could not open /content/gdrive/My Drive/output_gpu.txt");
            status = EXIT_FAILURE;
        }
        else {
            // One matrix row (WIDTH values) per output line
            for (int i = 0; i < N; i++) {
                fprintf(fp, "%d ", c[i]);
                if ((i + 1) % WIDTH == 0) fprintf(fp, "\n");
            }
            if (fclose(fp) != 0) {
                perror("Could not finish writing output file");
                status = EXIT_FAILURE;
            }
        }
    }

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    return status;
}

Total time taken (milliseconds): 0.29757800
