# BM40A1401 GPU Computing

## Erik Kuitunen

### Exercise 1

Import needed libraries.

In [477]:
#!pip install pycuda
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np

#### Task 1
Implement a kernel which takes two vectors A and B and adds them together to form a vector C.

Defining the kernel.

In [478]:
# Compile the Task 1 CUDA module. It defines one kernel:
#   vector_addition(a, b, c, n_elem): writes c[i] = a[i] + b[i] for every i < n_elem.
# Each thread starts at threadIdx.x + blockIdx.x * blockDim.x and then advances by
# gridDim.x * blockDim.x (the total number of launched threads), so a launch with
# fewer threads than elements still visits every index below n_elem.
# The pointer parameters are double*, so the host arrays handed to it must be float64.
modd = SourceModule("""
  __global__ void vector_addition( double* a, double* b, double* c, int n_elem) {

    for ( int i = threadIdx.x + blockIdx.x * blockDim.x; 
          i < n_elem; 
          i += gridDim.x * blockDim.x ) {
            
      c[i] = a[i] + b[i];
    
    }
  }
""")

Create the data vectors and initialize the block and grid dimensions.

In [479]:
# Number of elements in each input vector
N = 100000

# Two independent standard-normal vectors, stored as float64 (the kernel takes double*)
a, b = ( np.random.randn( N ).astype( np.float64 ) for _ in range( 2 ) )

# Launch configuration as (x, y, z) tuples: 64 blocks of 1024 threads each.
# That is 65536 threads, fewer than N; the loop inside the kernel covers the remainder.
block_dims = ( 1024, 1, 1 )
grid_dims = ( 64, 1, 1 )

Allocate memory to GPU and copy data to device

In [480]:
# Device buffers for the two inputs. Each buffer is sized from the array it will
# hold (`.nbytes` == size * itemsize), so a later change to one array's length or
# dtype cannot silently leave its buffer with the wrong size.
a_gpu = cuda.mem_alloc( a.nbytes )
cuda.memcpy_htod( a_gpu, a )

b_gpu = cuda.mem_alloc( b.nbytes )
cuda.memcpy_htod( b_gpu, b )

# Host array that will receive the result, plus its device counterpart.
# Nothing is copied into c_gpu here.
c = np.empty_like(a)
c_gpu = cuda.mem_alloc( c.nbytes )

Calling the CUDA kernel.

In [481]:
# Fetch the compiled kernel from the module by name
kernel = modd.get_function( "vector_addition" )

# Launch it. The element count is wrapped in np.int32 to match the kernel's
# `int n_elem` parameter; the pointers are the device allocations.
kernel(
    a_gpu, b_gpu, c_gpu,
    np.int32( N ),
    block = block_dims,
    grid = grid_dims,
)

Copying the results back to host and verifying the results

In [482]:
# Bring the GPU result back into the host array `c`
cuda.memcpy_dtoh( c, c_gpu )

# Reference result computed by NumPy on the host
c_cpu = a + b

# Element-wise exact comparison, reduced to a single truth value
vectors_match = ( c == c_cpu ).all()

print(
    "The vectors are the same."
    if vectors_match
    else "The vector are not the same. Something is wrong."
)

The vectors are the same.


#### Task 2

Implement a kernel which multiplies two matrices together.

Defining the kernel

In [483]:
# Compile the Task 2 CUDA module (naive matrix product, one thread per output element).
modd = SourceModule("""
  // C = A * B for row-major matrices stored as flat arrays:
  //   A is M x N, B is N x K, C is M x K.
  // Each thread computes one element C[row][col]. The global row/column is built
  // from the block and thread indices, so the kernel is correct for any 2D launch
  // that covers C; with a single block it reduces to row = threadIdx.y,
  // col = threadIdx.x.
  __global__ void matrix_multiplication( const float* A, const float* B, float* C, int M, int N, int K) {

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside C have nothing to compute
    if ( row >= M || col >= K ) {
      return;
    }

    // Dot product of row `row` of A with column `col` of B
    float C_elem = 0.0f;
    for ( int ii = 0; ii < N; ++ii ) {
      C_elem += A[ row * N + ii ] * B[ ii * K + col ];
    }

    C[ row * K + col ] = C_elem;

  }
""")

Create the data matrices and initialize the block and grid dimensions.

In [484]:
# Threads per block along each of x and y
BLOCK_SIZE = 16

M = 14   # Rows of A and C
N = 15   # Columns of A; rows of B
K = 16   # Columns of B and C

# Single-precision inputs with entries drawn uniformly from [0, 1)
A = np.float32( np.random.rand( M, N ) )
B = np.float32( np.random.rand( N, K ) )

block_dims = ( BLOCK_SIZE, BLOCK_SIZE, 1 )

# Enough blocks to cover every element of C: x spans the K columns, y spans the
# M rows. Ceiling division keeps a partially filled last block. For the sizes
# above this evaluates to ( 1, 1, 1 ).
grid_dims = ( ( K + BLOCK_SIZE - 1 ) // BLOCK_SIZE,
              ( M + BLOCK_SIZE - 1 ) // BLOCK_SIZE,
              1 )

Allocate memory and copy data from host to device 

In [485]:
# Host array for the product: rows of A by columns of B, single precision
C = np.empty( ( A.shape[0], B.shape[1] ), dtype = np.float32 )

# One device buffer per matrix, each sized from its host array
A_gpu = cuda.mem_alloc( A.nbytes )
B_gpu = cuda.mem_alloc( B.nbytes )
C_gpu = cuda.mem_alloc( C.nbytes )

# Upload the two inputs; nothing is copied into C_gpu here
cuda.memcpy_htod( A_gpu, A )
cuda.memcpy_htod( B_gpu, B )


Calling the CUDA kernel.

In [486]:
# Fetch the compiled kernel from the module by name
kernel = modd.get_function( "matrix_multiplication" )

# The three dimensions are wrapped in np.int32 to match the kernel's
# `int M, int N, int K` parameters. K is the number of columns of B and C.
kernel( A_gpu, B_gpu, C_gpu, np.int32(M), np.int32(N), np.int32(K),
        block = block_dims, grid = grid_dims )

Copying the results back to host and verifying the results

In [487]:
# Bring the GPU result back into the host array `C`
cuda.memcpy_dtoh( C, C_gpu )

# Reference result computed by NumPy on the host
C_cpu = np.dot( A, B )

# Element-wise absolute difference, kept for inspection (e.g. C_diff.max())
C_diff = abs( C_cpu - C )

# Compare element by element with a tolerance. np.allclose accepts an element
# when |C - C_cpu| <= atol + rtol * |C_cpu|; the relative term allows for the
# float32 rounding of the accumulated sums.
if np.allclose( C, C_cpu, rtol = 1e-5, atol = 1e-6 ):
    print( "The result matrices are (nearly) the same." )
else:
    print( "The matrices are not the same. Something is wrong." )

The result matrices are (nearly) the same.


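Differences on the order of $10^{-7}$ can be found in the results. Why? Most likely because both results are computed in single precision (float32, machine epsilon $\approx 1.2 \times 10^{-7}$), and the GPU kernel and NumPy do not necessarily accumulate and round the sums in the same way.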




#### Task 3

Extend the kernel from task 2 to use shared memory.

In [None]:
# Compile the Task 3 CUDA module (tiled matrix product using shared memory).
modd = SourceModule("""
  // C = A * B for row-major matrices stored as flat arrays:
  //   A is M x N, B is N x K, C is M x K.
  // block_size is the tile width. Launch requirements:
  //   block  = ( block_size, block_size, 1 )
  //   grid   = enough blocks to cover C (x spans the K columns, y spans the M rows)
  //   shared = 2 * block_size * block_size * sizeof(float) bytes of dynamic shared memory
  __global__ void matmul_sharedmem( const float* A, const float* B, float* C, int block_size,
                                    int M, int N, int K ) {

    // Dynamic shared memory: one tile of A followed by one tile of B
    extern __shared__ float tiles[];
    float* A_tile = tiles;
    float* B_tile = tiles + block_size * block_size;

    // Block indices, each block computes submatrix of C, C_sub
    int block_row = blockIdx.y;
    int block_col = blockIdx.x;

    // Thread indices. Each thread computes an element of C_sub
    int thread_row = threadIdx.y;
    int thread_col = threadIdx.x;

    // Position in C of the element owned by this thread
    int row = block_row * block_size + thread_row;
    int col = block_col * block_size + thread_col;

    float C_elem = 0.0f;

    // Looping through relevant submatrices to compute C_sub
    int n_tiles = ( N + block_size - 1 ) / block_size;
    for ( int tile = 0; tile < n_tiles; ++tile ) {

      int a_col = tile * block_size + thread_col;
      int b_row = tile * block_size + thread_row;

      // Each thread loads one element of each tile. Positions outside the
      // matrices are filled with zero so they add nothing to the sum.
      A_tile[ thread_row * block_size + thread_col ] =
          ( row < M && a_col < N ) ? A[ row * N + a_col ] : 0.0f;
      B_tile[ thread_row * block_size + thread_col ] =
          ( b_row < N && col < K ) ? B[ b_row * K + col ] : 0.0f;

      // Both tiles must be fully loaded before any thread reads them
      __syncthreads();

      for ( int ii = 0; ii < block_size; ++ii ) {
        C_elem += A_tile[ thread_row * block_size + ii ] * B_tile[ ii * block_size + thread_col ];
      }

      // All threads must finish with these tiles before the next iteration overwrites them
      __syncthreads();
    }

    // No early return above: every thread of the block has to reach each
    // __syncthreads(), so the bounds check is applied only to the final store.
    if ( row < M && col < K ) {
      C[ row * K + col ] = C_elem;
    }

  }
""")