# BM40A1401 GPU Computing

## Erik Kuitunen

### Exercise 1

Import needed libraries.

In [84]:
#!pip install pycuda
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
import math
import timeit

#### Task 1
Implement vector differentiation similar to numpy diff function using shared memory.

Defining the kernel

In [85]:
kernel_code = ( """
  __global__ void vec_diff_sharedmem( const float* a, float* b, int data_size ) {
  
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int tId = threadIdx.x;
    float b_elem = 0;
    
    __shared__ float a_shared[ %(THREADS)s ];

    if ( index > data_size-2 ) {
      return;
    }
      
    // Each thread loads one element from global to shared mem
    a_shared[ tId ] = a[ index ];
     
    __syncthreads();
    
    // Handling the case, where calulation happens on the edge of a block or at the end of the data
    if ( tId == blockDim.x - 1 || index  == data_size - 2 ) {   
    
      float edge = a[ index + 1 ];
      
      b_elem = edge - a_shared[ tId ];
    
    } else {
      
      b_elem = a_shared[ tId + 1 ] - a_shared[ tId ];
      
    }
    
    __syncthreads();
    
    b[ index ] = b_elem;
     
  } 
  
  // Also non-shared memory version for comparison
  
  __global__ void vec_diff( const float* a, float* b, int data_size ) {
  
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    float b_elem = 0;

    if ( index > data_size-2 ) {
      return;
    }
    
    b_elem = a[ index + 1 ] - a[ index ];
      
    b[ index ] = b_elem;
     
  }
""" )



Create data vectors and initalize thread and block sizes

In [86]:
THREADS = 16*16

data_dim = 2**28

a = np.float32( np.random.rand( 1, data_dim ) )

grid_dim = math.ceil( data_dim / THREADS ) 

block_dims = ( THREADS, 1, 1 )
grid_dims = ( grid_dim, 1, 1 )

Allocate memory and copy data from host to device 

In [87]:

a_gpu = cuda.mem_alloc( a.nbytes )
cuda.memcpy_htod( a_gpu, a )

b = np.float32( np.empty( [ 1, data_dim-1] ) )
b_gpu = cuda.mem_alloc( b.nbytes )

b_noshared = np.float32( np.empty( [ 1, data_dim-1] ) )
b_gpu_noshared = cuda.mem_alloc( b_noshared.nbytes )
                        

Calling the CUDA kernel and copy the result back to host

In [88]:
# Specifying constant THREADS for shared memory kernel
kernel = kernel_code % {
        'THREADS': THREADS
        }

# Compile the shared memory kernel code
mod = SourceModule( kernel )

diff_gpu = mod.get_function( "vec_diff_sharedmem" )

# Measure gpu execution time
gpu_start = timeit.default_timer()   
 
diff_gpu( a_gpu, b_gpu, np.int32( data_dim ), 
        block = block_dims, grid = grid_dims )

gpu_time = ( timeit.default_timer() - gpu_start ) * 1000

cuda.memcpy_dtoh( b, b_gpu )

###### Doing the same as above for non-shared memory version
diff_gpu_noshared = mod.get_function( "vec_diff" )

# Measure gpu execution time
gpu_start = timeit.default_timer()   
 
diff_gpu_noshared( a_gpu, b_gpu_noshared, np.int32( data_dim ), 
                block = block_dims, grid = grid_dims )

gpu_time_noshared = ( timeit.default_timer() - gpu_start ) * 1000

cuda.memcpy_dtoh( b_noshared, b_gpu_noshared )

Verifying the results

In [89]:

# Measure cpu time
cpu_start = timeit.default_timer()  
 
b_cpu = np.diff( a )

cpu_time = ( timeit.default_timer() - cpu_start ) * 1000

if ( b_cpu == b ).all() and (b_cpu == b_noshared ).all():
    print( "The vectors are the same. \nCPU: " + str( cpu_time )+ " ms\nGPU, no shared memory: " 
            + str( gpu_time_noshared ) + " ms\nGPU, shared memory: " + str( gpu_time ) + " ms")   
else:
    print( "The vector are not the same. Something is wrong." )

The vectors are the same. 
CPU: 178.29549999441952 ms
GPU, no shared memory: 0.12549999519251287 ms
GPU, shared memory: 0.34339999547228217 ms


Shared memory version is slower. Bug in code, or is this kind of problem just generally more inefficient with shared memory? On the other hand, if running no shared memory first the shared memory version last, shared is (sometimes) faster. Why?

#### Task 2
Implement the three reduction models presented in the lectures. Time their performance against different vector sizes. Execution times can vary between executions, so run them for example 100 times and take the average time.

Add also CPU performance with numpy sum() - function.

In [90]:
data_sizes = np.array( [ 2**10, 2**13, 2**16, 2**20, 2**22 ] )

exec_times_gpu = np.zeros( ( 4, np.size( data_sizes ) ) )
exec_times_cpu = np.empty_like( exec_times_gpu )

threads_per_block = 1024

Defining the kernels

In [91]:
kernel_code = ( """
               
  __global__ void interleaved( const float*, int stride ) {
  
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int tId = threadIdx.x;
    
    
     
  }
  
  __global__ void sequential( const float* a, int stride ) {
  
    
     
  }
  
  __global__ void interleaved_shared( const float* a, int stride ) {
  
    
     
  }
  
  __global__ void sequential_shared( const float* a, int stride ) {
  
    
     
  }
  
""" )

In [92]:

sum_kernel = SourceModule( kernel_code ).get_function( 'interleaved' )

# Looping through data sizes, doig the calculations and saving results.
ii = 0
jj = 0
for size in data_sizes:    
    
    result = np.zeros_like( )
    
    # Specifying thread and block dimensions for kernel call
    block_dims = [ threads_per_block, 1, 1 ]
    grid_dims = [ math.ceil( size / threads_per_block), 1, 1 ]
    
    # Create data and allocate memory
    a = np.random.randn( 1, size ).astype( np.float32 )
    a_gpu = cuda.mem_alloc( a.nbytes )
    
    cuda.memcpy_htod( a_gpu, a )
    
    sum_kernel( a, block = block_dims, grid = grid_dims )
    
    cuda.memcpy_dtoh( result, a_gpu )
    
    exec_times_gpu[ii][jj] = result[0]


TypeError: zeros_like() missing 1 required positional argument: 'a'

#### Task 3:

Implement Sequential reduction with shared memory. Add the performance to the same plot used in the previous task.

