# Meeting August 21st

## A weird GPU bug when launching a kernel

```
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
```

After hitting this error, I have to restart Julia, because the same error is then reported for every subsequent GPU kernel launch.


## Memory throughput of D2y

I think this is close enough to ideal:
```
julia> tester_D2y(2000)
y ≈ y_gpu = true
y ≈ y_gpu_2 = true
y ≈ y_gpu_7 = true
Float64(t1) = 4.43101793e8
Float64(t2) = 4.64824109e8
Float64(t3) = 5.526723e6
Float64(t7) = 5.161289e6
t1 / t2 = 0.9532676649523788
t1 / t3 = 80.17441673845424
t1 / t7 = 85.85099439306731
CPU Through-put                 1.44
GPU Through-put                 1.38
GPU (v2) Through-put               115.80
GPU (v7) Through-put               124.00
(4.43101793e8, 4.64824109e8, 5.526723e6)
```

In [3]:
"""
    D2y_GPU_v7(d_u, d_y, Nx, Ny, h, Val(TILE_DIM1), Val(TILE_DIM2))

CUDA kernel applying a second-difference stencil along the first (fastest
varying) index of a field stored as a flat vector with linear index
`i + (j-1)*Nx`, writing the result to `d_y`.

For every `1 <= j <= Ny`:

- `i == 1`:            `(u[1] - 2u[2] + u[3]) / h^2`
- `2 <= i <= Nx-1`:    `(u[i-1] - 2u[i] + u[i+1]) / h^2`
- `i == Nx`:           `(u[Nx-2] - 2u[Nx-1] + u[Nx]) / h^2`

# Arguments
- `d_u`: device input vector of length `Nx*Ny`.
- `d_y`: device output vector of length `Nx*Ny`.
- `Nx`, `Ny`: extents of the two index directions.
- `h`: grid spacing; results are divided by `h^2`.
- `Val(TILE_DIM1)`, `Val(TILE_DIM2)`: compile-time tile extents; they size the
  shared-memory tile.

# Launch configuration
Launch with `threads = (TILE_DIM1, TILE_DIM2)` and enough blocks to cover
`Nx` in x and `Ny` in y. Threads that fall outside the grid do nothing.

# Notes
The one-sided boundary stencils need three points, so when `Nx < 3` the
entries with `i == 1` and `i == Nx` are left unwritten.
"""
function D2y_GPU_v7(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}) where {TILE_DIM1, TILE_DIM2}
	# Position of this thread inside its block.
	k = threadIdx().x
	l = threadIdx().y

	# Position of this thread in the Nx-by-Ny grid, and its flat index.
	i = (blockIdx().x - 1) * TILE_DIM1 + k
	j = (blockIdx().y - 1) * TILE_DIM2 + l
	global_index = i + (j-1)*Nx

	# The centered stencil reaches one point to either side.
	HALO_WIDTH = 1
	tile = @cuStaticSharedMem(eltype(d_u),(TILE_DIM1+2*HALO_WIDTH,TILE_DIM2))

	# Guard on (i, j) rather than on global_index alone: a thread with i > Nx
	# would otherwise wrap around into the next column.
	active = k <= TILE_DIM1 && l <= TILE_DIM2 && i <= Nx && j <= Ny

	if active
		# Tile interior: one element per thread, shifted past the upper halo.
		tile[k+HALO_WIDTH,l] = d_u[global_index]

		# Upper halo: the first HALO_WIDTH threads fetch the points just above the tile.
		if k <= HALO_WIDTH && i > HALO_WIDTH
			tile[k,l] = d_u[global_index - HALO_WIDTH]
		end

		# Lower halo: the last HALO_WIDTH threads fetch the points just below the tile.
		# Strict `>` so that only those threads write, each to its own halo cell.
		if k > TILE_DIM1 - HALO_WIDTH && i + HALO_WIDTH <= Nx
			tile[k+2*HALO_WIDTH,l] = d_u[global_index + HALO_WIDTH]
		end
	end

	# The three loads above write disjoint tile cells, so a single barrier
	# before the first read is sufficient. It sits outside every branch so
	# that all threads of the block reach it.
	sync_threads()

	if active
		if 2 <= i <= Nx-1
			# Center: tile row k+HALO_WIDTH holds u[i]; its neighbours are adjacent rows.
			d_y[global_index] = (tile[k+HALO_WIDTH-1,l] - 2*tile[k+HALO_WIDTH,l] + tile[k+HALO_WIDTH+1,l]) / h^2
		elseif Nx >= 3
			# One-sided boundary stencils reach two points inward, which a
			# one-wide halo does not always cover (e.g. i == Nx landing on
			# k == 1 would need tile[0, l]). Only two rows per column take
			# this path, so read global memory directly instead.
			if i == 1
				# Upper boundary
				d_y[global_index] = (d_u[global_index] - 2*d_u[global_index+1] + d_u[global_index+2]) / h^2
			else
				# Lower boundary (i == Nx)
				d_y[global_index] = (d_u[global_index-2] - 2*d_u[global_index-1] + d_u[global_index]) / h^2
			end
		end
	end

	nothing

end

"""
    tester_D2y(Nx)

Validate and time the GPU `D2y` kernels against the CPU `D2y` on a random
`Nx`-by-`Nx` field.

The function

1. runs `D2y` on the CPU and `D2y_GPU`, `D2y_GPU_v5`, `D2y_GPU_v7` on the GPU
   once each and prints whether each GPU result is `≈` the CPU result;
2. times `rep_times = 10` repetitions of each of the four implementations
   with `time_ns()`, synchronizing the device before each end stamp;
3. prints the raw timings, the CPU/GPU ratios and a throughput figure
   `2 * memsize * rep_times / t` (bytes per nanosecond, i.e. GB/s, counting
   one pass over the input and one over the output per repetition).

Returns `(t1, t2, t3, t7)` as `Float64` nanoseconds for CPU, `D2y_GPU`,
`D2y_GPU_v5` and `D2y_GPU_v7` respectively.
"""
function tester_D2y(Nx)
	Ny = Nx
	u = randn(Nx * Ny)
	d_u = CuArray(u)
	d_y = similar(d_u)
	d_y2 = similar(d_u)
	d_y7 = similar(d_u)
	h = 1/Nx

	rep_times = 10

	# 1-D launch configuration used by D2y_GPU and D2y_GPU_v5.
	TILE_DIM = 32
	THREAD_NUM = 32
	BLOCK_NUM = div(Nx * Ny,TILE_DIM) + 1

	# 2-D launch configuration used by D2y_GPU_v7. `cld` rounds up without
	# adding an idle extra row/column of blocks when the size divides evenly.
	TILE_DIM_1 = 16
	TILE_DIM_2 = 4
	griddim = (cld(Nx,TILE_DIM_1),cld(Ny,TILE_DIM_2))
	blockdim = (TILE_DIM_1,TILE_DIM_2)

	# --- Correctness check (these first launches also trigger kernel compilation) ---
	y = D2y(u,Nx,Ny,h)
	@cuda threads=THREAD_NUM blocks=BLOCK_NUM D2y_GPU(d_u,d_y,Nx,Ny,h,Val(TILE_DIM))
	y_gpu = collect(d_y)
	@cuda threads=THREAD_NUM blocks=BLOCK_NUM D2y_GPU_v5(d_u,d_y2,Nx,Ny,h,Val(TILE_DIM))
	synchronize()
	y_gpu_2 = collect(d_y2)
	@cuda threads=blockdim blocks=griddim D2y_GPU_v7(d_u,d_y7, Nx, Ny, h, Val(TILE_DIM_1), Val(TILE_DIM_2))
	y_gpu_7 = collect(d_y7)
	@show y ≈ y_gpu
	@show y ≈ y_gpu_2
	@show y ≈ y_gpu_7

	# --- Timing ---
	# CPU reference. This must time D2y (the operator validated above), not D2x.
	ty = time_ns()
	for _ in 1:rep_times
		D2y(u,Nx,Ny,h)
	end
	ty_end = time_ns()
	t1 = ty_end - ty

	# Kernel launches are asynchronous, so synchronize before taking each end stamp.
	t_dy = time_ns()
	for _ in 1:rep_times
		@cuda threads=THREAD_NUM blocks=BLOCK_NUM D2y_GPU(d_u,d_y,Nx,Ny,h,Val(TILE_DIM))
	end
	synchronize()
	t_dy_end = time_ns()
	t2 = t_dy_end - t_dy

	t_dy_v2 = time_ns()
	for _ in 1:rep_times
		@cuda threads=THREAD_NUM blocks=BLOCK_NUM D2y_GPU_v5(d_u,d_y2,Nx,Ny,h,Val(TILE_DIM))
	end
	synchronize()
	t_dy_v2_end = time_ns()
	t3 = t_dy_v2_end - t_dy_v2

	t_dy_v7 = time_ns()
	for _ in 1:rep_times
		@cuda threads=blockdim blocks=griddim D2y_GPU_v7(d_u,d_y7, Nx, Ny, h, Val(TILE_DIM_1), Val(TILE_DIM_2))
	end
	synchronize()
	t_dy_v7_end = time_ns()
	t7 = t_dy_v7_end - t_dy_v7

	# --- Report ---
	@show Float64(t1)
	@show Float64(t2)
	@show Float64(t3)
	@show Float64(t7)

	@show t1/t2
	@show t1/t3
	@show t1/t7

	# Bytes in one copy of the field; the factor 2 below counts input + output.
	# Timings are in nanoseconds, so the printed figures are bytes/ns (GB/s).
	# Note: the "GPU (v2)" line reports the D2y_GPU_v5 kernel.
	memsize = length(u) * sizeof(eltype(u))
	@printf("CPU Through-put %20.2f\n", 2 * memsize * rep_times / t1)
	@printf("GPU Through-put %20.2f\n", 2 * memsize * rep_times / t2)
	@printf("GPU (v2) Through-put %20.2f\n", 2 * memsize * rep_times / t3)
	@printf("GPU (v7) Through-put %20.2f\n", 2 * memsize * rep_times / t7)

	return Float64(t1), Float64(t2), Float64(t3), Float64(t7)
end

LoadError: UndefVarError: @cuStaticSharedMem not defined