In [41]:
# !pip install pycuda

In [42]:
# CUDA C source for a shared-memory tiled integer matrix multiply, C = A x B.
# A is N1 x N2, B is N2 x N3 and C is N1 x N3, all indexed as row-major int arrays.
# The kernel computes row/col from TILE_SIZE, so it must be launched with
# TILE_SIZE x TILE_SIZE thread blocks. TILE_SIZE defaults to 4 and can be
# overridden at compile time with -DTILE_SIZE=<n> (for example through the
# `options` argument of SourceModule); the host launch configuration must then
# use the same value.
kernel_source = """
#ifndef TILE_SIZE
// Default tile edge. Other values (e.g. 8, 16 or 32) can be used as long as
// TILE_SIZE*TILE_SIZE does not exceed the device's threads-per-block limit.
#define TILE_SIZE 4
#endif

__global__ void tiled_mat_mul (int *a, int *b, int *c, int N1, int N2, int N3)
{
  // Shared-memory staging tiles for the current phase of A and B.
  __shared__ int tileA[TILE_SIZE][TILE_SIZE];
  __shared__ int tileB[TILE_SIZE][TILE_SIZE];

  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Each thread produces one element C[row][col].
  int row = blockIdx.y * TILE_SIZE + ty;
  int col = blockIdx.x * TILE_SIZE + tx;

  int temp = 0;

  // Walk the shared dimension N2 one tile at a time; the bound is rounded up
  // so a partial last tile is still visited.
  for (int phase = 0; phase < (N2 + TILE_SIZE - 1) / TILE_SIZE; phase++)
  {
    // Stage one element of each tile. Positions that fall outside A or B are
    // zero-filled so they contribute nothing to the dot product.
    if ((row < N1) && ((phase * TILE_SIZE + tx) < N2))
      tileA[ty][tx] = a[row * N2 + phase * TILE_SIZE + tx];
    else
      tileA[ty][tx] = 0;

    if (((phase * TILE_SIZE + ty) < N2) && (col < N3))
      tileB[ty][tx] = b[(phase * TILE_SIZE + ty) * N3 + col];
    else
      tileB[ty][tx] = 0;

    // Both tiles must be completely loaded before any thread reads them.
    __syncthreads();

    // Partial dot product over this tile.
    for (int k = 0; k < TILE_SIZE; k++)
    {
      temp += tileA[ty][k] * tileB[k][tx];
    }

    // Keep the next phase from overwriting tiles that are still being read.
    __syncthreads();
  }

  // Threads mapped outside C (edge blocks) write nothing.
  if ((row < N1) && (col < N3))
  {
    c[row * N3 + col] = temp;
  }
}"""



In [43]:
import pycuda.autoinit
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import time
# Tile edge length used on the host to size the thread blocks and the grid.
# Must equal the TILE_SIZE macro defined inside `kernel_source`, because the
# kernel derives its row/column indices from that macro.
TILE_SIZE= 4

In [44]:
print("Compiling the CUDA Kernel....")
# Compile the CUDA C source held in `kernel_source` into a PyCUDA module.

mode =  SourceModule(kernel_source)

# Look up the `tiled_mat_mul` kernel in the compiled module; the returned
# handle is what the notebook later calls as `mat_mul`.
mat_mul= mode.get_function('tiled_mat_mul')


print("kernel compiled successfully")

Compiling the CUDA Kernel....
kernel compiled successfully


In [45]:
# Build the input matrices A (M x N) and B (N x P) and the host buffer for C (M x P).

M=100
N=10
P=100
# Fixed seed so the random inputs are identical on every run.
np.random.seed(1)
# Inputs hold integers in [0, 10) and are cast to int32 to match the kernel's `int` arrays.
mat1 = np.random.randint(0, 10, size=(M, N)).astype(np.int32)
mat2 = np.random.randint(0, 10, size=(N, P)).astype(np.int32)
# Result buffer, allocated directly as zero-filled int32. This avoids building a
# float64 array only to cast it, and never exposes uninitialised memory.
mat3 = np.zeros((M, P), dtype=np.int32)


print(f"The matrices are loaded with shape {mat1.shape} {mat2.shape} {mat3.shape}")

The matrices are loaded with shape (100, 10) (10, 100) (100, 100)


In [46]:
# Allocate one device buffer per matrix, sized from the host arrays.
a=cuda.mem_alloc(mat1.nbytes)
b=cuda.mem_alloc(mat2.nbytes)
c=cuda.mem_alloc(mat3.nbytes)

print("memory on gpu allocated successfully")

# Upload only the inputs A and B. The output buffer `c` is deliberately not
# initialised from the host: the kernel launch covers the whole M x P result
# and writes every in-range element, so uploading mat3 would be a wasted transfer.
cuda.memcpy_htod(a,mat1)
cuda.memcpy_htod(b,mat2)

print("Copying the matrices to the gpu was successful")


memory on gpu allocated successfully
Copying the matrices to the gpu was successful


In [47]:
# Run the kernel on the GPU and time it.
# One TILE_SIZE x TILE_SIZE thread block computes one tile of the result.
BLOCK_SIZE=(TILE_SIZE,TILE_SIZE,1)
# Ceiling division so partial tiles at the right/bottom edges still get a block:
# grid x spans the P result columns, grid y spans the M result rows.
GRID_SIZE=((P + TILE_SIZE - 1) // TILE_SIZE,(M+TILE_SIZE-1)//TILE_SIZE)

# Start the timer. perf_counter() is monotonic and uses the highest-resolution
# clock available, which matters for an interval this short.
start_time_gpu=time.perf_counter()

# Launch the kernel; the matrix dimensions are passed as 32-bit ints to match
# the kernel's `int` parameters.
mat_mul(a,b,c,np.int32(M),np.int32(N),np.int32(P),block=BLOCK_SIZE,grid=GRID_SIZE)

# Wait for the GPU to finish before stopping the timer, so the measured
# interval includes the kernel's execution.
cuda.Context.synchronize()

# Stop the timer
end_time_gpu=time.perf_counter()

# Elapsed time in seconds (launch + synchronize only; host<->device copies excluded).
time_taken_for_gpu=end_time_gpu-start_time_gpu

print('Execution completed successfull in GPU')

Execution completed successfull in GPU


In [48]:
# Copy the result buffer `c` from the device into the host array `mat3`.
cuda.memcpy_dtoh(mat3,c)

print("The matrix is copied successfully back to CPU")

The matrix is copied successfully back to CPU


In [49]:
# Reference matrix multiplication on the CPU with NumPy.

# Start the timer. perf_counter() is monotonic and uses the highest-resolution
# clock available, which matters for an interval this short.
start_time_cpu=time.perf_counter()

# Execute on the CPU
mat4=mat1@mat2

# Stop the timer
end_time_cpu=time.perf_counter()

# Elapsed time in seconds.
time_taken_for_cpu=end_time_cpu-start_time_cpu

# Validate the GPU result (already copied back into mat3) against the CPU
# reference. Raises AssertionError if any element differs, so a wrong kernel
# cannot go unnoticed. Done outside the timed region on purpose.
np.testing.assert_array_equal(mat3, mat4)


In [50]:
# Report both elapsed times, in seconds.
# NOTE(review): each figure comes from a single run, and the GPU figure covers only
# the kernel launch plus synchronize (not the host<->device copies), so treat the
# comparison as indicative rather than a benchmark.
print(f"Time taken in GPU is {time_taken_for_gpu} \nTime taken in CPU is {time_taken_for_cpu}")


Time taken in GPU is 0.0002372264862060547 
Time taken in CPU is 0.0003228187561035156
