# August 14th Meeting


In [1]:
## Finite Difference Operator with Shared Memory

using CUDA

"""
    D2x_GPU_v5(d_u, d_y, Nx, Ny, h, Val(TILE_DIM1), Val(TILE_DIM2))

CUDA kernel that writes to `d_y` a three-point second difference of `d_u`,
divided by `h^2`, taken with a stride of `Ny` in the linear index
`global_index = i + (j-1)*Ny`.

# Arguments
- `d_u`: input device array, read with linear indices `1:Nx*Ny`.
- `d_y`: output device array, written with the same linear indices.
- `Nx`, `Ny`: grid extents; `Ny` is the linear-index stride between the
  stencil points. The one-sided boundary stencils read three consecutive
  columns, so `Nx >= 3` is required.
- `h`: grid spacing; every result is divided by `h^2`.
- `Val(TILE_DIM1)`, `Val(TILE_DIM2)`: compile-time tile extents. The shared
  tile is `(TILE_DIM1, TILE_DIM2 + 4)`: two halo columns on each side.
  `TILE_DIM2 >= 2` is required because each halo column is loaded by a
  distinct thread.

# Stencils (in terms of column `j`)
- first column  (`global_index <= Ny`):        `u[j]   - 2u[j+1] + u[j+2]`
- interior:                                     `u[j-1] - 2u[j]   + u[j+1]`
- last column   (`global_index > (Nx-1)*Ny`):  `u[j-2] - 2u[j-1] + u[j]`

# Notes
- Assumes a launch with `threads = (TILE_DIM1, TILE_DIM2)` -- TODO confirm
  against the call site, which is not part of this file section.
- Bounds are checked on the linear index only; there is no separate check
  of `i` against `Ny`.
- Returns `nothing`.
"""
function D2x_GPU_v5(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}) where {TILE_DIM1, TILE_DIM2}
	tidx = threadIdx().x
	tidy = threadIdx().y

	i = (blockIdx().x - 1) * TILE_DIM1 + tidx
	j = (blockIdx().y - 1) * TILE_DIM2 + tidy

	global_index = i + (j - 1) * Ny
	N = Nx * Ny

	# Tile column c holds column (j - tidy) + c - 2 of the grid:
	# columns 1:2 are the left halo, 3:TILE_DIM2+2 the tile itself,
	# TILE_DIM2+3:TILE_DIM2+4 the right halo.
	tile = @cuStaticSharedMem(eltype(d_u), (TILE_DIM1, TILE_DIM2 + 4))

	# Threads outside the tile footprint must not touch shared memory.
	in_tile = tidx <= TILE_DIM1 && tidy <= TILE_DIM2

	# Tile interior
	if in_tile && global_index <= N
		tile[tidx, tidy + 2] = d_u[global_index]
	end

	# Left halo: the first two threads along y fetch the two columns before the tile.
	if in_tile && tidy <= 2 && 2 * Ny + 1 <= global_index <= (Nx + 2) * Ny
		tile[tidx, tidy] = d_u[global_index - 2 * Ny]
	end

	# Right halo: the last two threads along y fetch the two columns after the tile.
	# The only bound needed is that the source index stays inside the array; a lower
	# bound on global_index would leave the first tile's halo unloaded for small tiles.
	if in_tile && tidy >= TILE_DIM2 - 1 && global_index <= (Nx - 2) * Ny
		tile[tidx, tidy + 4] = d_u[global_index + 2 * Ny]
	end

	# The three loads above write disjoint tile columns and none of them reads the
	# tile, so a single barrier between loading and computing is sufficient.
	sync_threads()

	# Finite difference operation starts here
	if in_tile
		if global_index <= Ny
			# first column: one-sided stencil reaching forward
			d_y[global_index] = (tile[tidx, tidy + 2] - 2 * tile[tidx, tidy + 3] + tile[tidx, tidy + 4]) / h^2
		elseif global_index <= (Nx - 1) * Ny
			# interior: centred stencil
			d_y[global_index] = (tile[tidx, tidy + 1] - 2 * tile[tidx, tidy + 2] + tile[tidx, tidy + 3]) / h^2
		elseif global_index <= N
			# last column: one-sided stencil reaching backward
			d_y[global_index] = (tile[tidx, tidy] - 2 * tile[tidx, tidy + 1] + tile[tidx, tidy + 2]) / h^2
		end
	end

	return nothing
end

D2x_GPU_v5 (generic function with 1 method)

This works, but the performance is not ideal. I believe it has something to do with how I write data into shared memory.

This is how it's done in the C++ code. I tried something similar, but there were bugs, so I went with my current implementation, which doesn't copy-paste within shared memory to fill the halo.


```
// fill in periodic images in shared memory array 
  if (i < 4) {
    s_f[sj][si-4]  = s_f[sj][si+mx-5];
    s_f[sj][si+mx] = s_f[sj][si+1];   
  }
```