# Meeting August 21st

## One weird GPU bug when launching kernel

```
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
```

After encountering this bug, I need to restart Julia because this bug would be reported for all other GPU kernel functions


## Memory through-put of D2y

I think this is ideal enough
```
julia> tester_D2y(2000)
y ≈ y_gpu = true
y ≈ y_gpu_2 = true
y ≈ y_gpu_7 = true
Float64(t1) = 4.43101793e8
Float64(t2) = 4.64824109e8
Float64(t3) = 5.526723e6
Float64(t7) = 5.161289e6
t1 / t2 = 0.9532676649523788
t1 / t3 = 80.17441673845424
t1 / t7 = 85.85099439306731
CPU Through-put                 1.44
GPU Through-put                 1.38
GPU (v2) Through-put               115.80
GPU (v7) Through-put               124.00
(4.43101793e8, 4.64824109e8, 5.526723e6)


julia> tester_D2y(10000)
y ≈ y_gpu = true
y ≈ y_gpu_2 = true
y ≈ y_gpu_7 = true
Float64(t1) = 2.6545772606e10
Float64(t2) = 5.8792959517e10
Float64(t3) = 1.3529684e8
Float64(t7) = 1.2718651e8
t1 / t2 = 0.4515127801709707
t1 / t3 = 196.2039365147035
t1 / t7 = 208.71531584599657
CPU Through-put                 0.60
GPU Through-put                 0.27
GPU (v2) Through-put               118.26
GPU (v7) Through-put               125.80
(2.6545772606e10, 5.8792959517e10, 1.3529684e8, 1.2718651e8)
```

for Nx=Ny= 10000
Time for D2y() (matrix-free on CPU): 26.5s
Time for D2x() (matrix-free on GPU with shared memory): 0.127s

In [3]:
function D2y_GPU_v7(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}) where {TILE_DIM1, TILE_DIM2}
	tidx = threadIdx().x
	tidy = threadIdx().y

	i = (blockIdx().x - 1) * TILE_DIM1 + tidx
	j = (blockIdx().y - 1) * TILE_DIM2 + tidy

	global_index = i + (j-1)*Nx

	HALO_WIDTH = 1
	tile = @cuStaticSharedMem(eltype(d_u),(TILE_DIM1+2*HALO_WIDTH,TILE_DIM2))

	k = tidx
	l = tidy

	# Writing pencil-shaped shared memory

	# for tile itself
	if k <= TILE_DIM1 && l <= TILE_DIM2 && global_index <= Nx*Ny
		tile[k+HALO_WIDTH,l] = d_u[global_index]
	end

	sync_threads()

	# For upper halo
	if k <= HALO_WIDTH && l <= TILE_DIM2 && HALO_WIDTH + 1 <= global_index <= Nx*Ny + HALO_WIDTH
		tile[k,l] = d_u[global_index - HALO_WIDTH]
	end

	sync_threads()

	# For lower halo
	if k >= TILE_DIM1 - HALO_WIDTH && l <= TILE_DIM2 && HALO_WIDTH + 1 <= global_index <= Nx*Ny - HALO_WIDTH
		tile[k+2*HALO_WIDTH,l] = d_u[global_index + HALO_WIDTH]
	end

	sync_threads()


	# Finite Difference Operations starts here

	#Upper Boundary
	if k + HALO_WIDTH <= TILE_DIM1 + 2*HALO_WIDTH && l <= TILE_DIM2 && i == 1 && j <= Ny
		d_y[global_index] = (tile[k+HALO_WIDTH,l] - 2*tile[k+HALO_WIDTH+1,l] + tile[k+HALO_WIDTH+2,l]) / h^2
	end

	sync_threads()

	#Center
	if k + HALO_WIDTH <= TILE_DIM1 + 2*HALO_WIDTH && l <= TILE_DIM2 && 2 <= i <= Nx-1 && j <= Ny
		d_y[global_index] = (tile[k+HALO_WIDTH-1,l] - 2*tile[k+HALO_WIDTH,l] + tile[k+HALO_WIDTH+1,l]) / h^2
	end

	sync_threads()

	#Lower Boundary
	if k + HALO_WIDTH <= TILE_DIM1 + 2*HALO_WIDTH && l <= TILE_DIM2 && i == Nx && j <= Ny
		d_y[global_index] = (tile[k+HALO_WIDTH-2,l] - 2*tile[k+HALO_WIDTH-1,l] + tile[k+HALO_WIDTH,l]) / h^2
	end

	sync_threads()
	
	nothing

end

function tester_D2y(Nx)
	# Nx = Ny = 1000;
	Ny = Nx
	u = randn(Nx * Ny)
	d_u = CuArray(u)
	d_y = similar(d_u)
	d_y2 = similar(d_u)
	d_y7 = similar(d_u)
	h = 1/Nx
	TILE_DIM=32
	t1 = 0
	t2 = 0
	t3 = 0

	TILE_DIM_1 = 16
	TILE_DIM_2 = 4

	rep_times = 10

	THREAD_NUM = 32
	BLOCK_NUM = div(Nx * Ny,TILE_DIM) + 1

	griddim = (div(Nx,TILE_DIM_1)+1,div(Ny,TILE_DIM_2)+1)
	blockdim = (TILE_DIM_1,TILE_DIM_2)

	y = D2y(u,Nx,Ny,h)
	@cuda threads=THREAD_NUM blocks=BLOCK_NUM D2y_GPU(d_u,d_y,Nx,Ny,h,Val(TILE_DIM))
	y_gpu = collect(d_y)
	@cuda threads=THREAD_NUM blocks=BLOCK_NUM D2y_GPU_v5(d_u,d_y2,Nx,Ny,h,Val(TILE_DIM))
	synchronize()
	y_gpu_2 = collect(d_y2)
	@cuda threads=blockdim blocks=griddim D2y_GPU_v7(d_u,d_y7, Nx, Ny, h, Val(TILE_DIM_1), Val(TILE_DIM_2))
	y_gpu_7 = collect(d_y7)
	# @show y_gpu - y
	# @show y_gpu_2 - y
	@show y ≈ y_gpu
	@show y ≈ y_gpu_2
	@show y ≈ y_gpu_7


	ty = time_ns()
	for i in 1:rep_times
		y = D2x(u,Nx,Ny,h)
	end
	ty_end = time_ns()
	t1 = ty_end - ty
	t_dy = time_ns()
	for i in 1:rep_times
		@cuda threads=THREAD_NUM blocks=BLOCK_NUM D2y_GPU(d_u,d_y,Nx,Ny,h,Val(TILE_DIM))
	end
	synchronize()
	# sync_threads()
	t_dy_end = time_ns()
	t2 = t_dy_end - t_dy

	t_dy_v2 = time_ns()
	for i in 1:rep_times
		@cuda threads=THREAD_NUM blocks=BLOCK_NUM D2y_GPU_v5(d_u,d_y2,Nx,Ny,h,Val(TILE_DIM))
	end
	synchronize()
	# sync_threads()
	t_dy_v2_end = time_ns()
	t3 = t_dy_v2_end - t_dy_v2

	t_dy_v7 = time_ns()
	for i in 1:rep_times
		@cuda threads=blockdim blocks=griddim D2y_GPU_v7(d_u,d_y7, Nx, Ny, h, Val(TILE_DIM_1), Val(TILE_DIM_2))
	end
	synchronize()
	t_dy_v7_end = time_ns()
	t7 = t_dy_v7_end - t_dy_v7

	@show Float64(t1)
	@show Float64(t2)
	@show Float64(t3)
	@show Float64(t7)

	@show t1/t2
	@show t1/t3
	@show t1/t7

	memsize = length(u) * sizeof(eltype(u))
	@printf("CPU Through-put %20.2f\n", 2 * memsize * rep_times / t1)
	@printf("GPU Through-put %20.2f\n", 2 * memsize * rep_times / t2)
	@printf("GPU (v2) Through-put %20.2f\n", 2 * memsize * rep_times / t3)
	@printf("GPU (v7) Through-put %20.2f\n", 2 * memsize * rep_times / t7)

	return Float64(t1), Float64(t2), Float64(t3), Float64(t7)
end

LoadError: UndefVarError: @cuStaticSharedMem not defined

## Unrolling does not have better performance here

```
UNROLL_NUM = 0

	@unroll for m = 0:UNROLL_NUM
		if k + m <= TILE_DIM1 && l <= TILE_DIM2 # &&  global_index <= Nx*Ny
			@inbounds tile[tidx+m, tidy+HALO_WIDTH] = d_u[global_index + m]
		end
	end

```

Unrolling for reading data into shared memory in D2x_GPU(), through-put decreases as we increase UNROLL_NUM (UNROLL_NUM <= TILE_DIM1 - 1)


```
	if global_index <= Nx*Ny
		d_y[global_index] = tile[k+HALO_WIDTH,l]
	end # Check if copying data to tile is correct, checked

	UNROLL_NUM = TILE_DIM2 - 1

	for m = 1:UNROLL_NUM
		if k + HALO_WIDTH <= TILE_DIM1 + 2*HALO_WIDTH && l+m <= TILE_DIM2 && i == 1 && j <= Ny
			d_y[global_index + m*Nx] = (tile[k+HALO_WIDTH,l+m] - 2*tile[k+HALO_WIDTH+1,l+m] + tile[k+HALO_WIDTH+2,l+m]) / h^2
		end

		if k + HALO_WIDTH <= TILE_DIM1 + 2*HALO_WIDTH && l+m <= TILE_DIM2 && 2 <= i <= Nx-1 && j <= Ny
			d_y[global_index + m*Nx] = (tile[k+HALO_WIDTH-1,l+m] - 2*tile[k+HALO_WIDTH,l+m] + tile[k+HALO_WIDTH+1,l+m]) / h^2
		end

		if k + HALO_WIDTH <= TILE_DIM1 + 2*HALO_WIDTH && l+m <= TILE_DIM2 && i == Nx && j <= Ny
			d_y[global_index + m*Nx] = (tile[k+HALO_WIDTH-2,l+m] - 2*tile[k+HALO_WIDTH-1,l+m] + tile[k+HALO_WIDTH,l+m]) / h^2
		end
	end
```

Unrolling for calculating derivatives using shared memory in D2y_GPU(), through-put decreases as we increase UNROLL_NUM (UNROLL_NUM <= TILE_DIM2 - 1)

## Performance using different shapes of pencil. D2y_GPU(), Nx = Ny = 2000


| TILE_DIM1 | TILE_DIM2 | Through-put (GB/s)|
| ---------  |  --------- | --------- |
| 16       |    16    |   116    |
| 16       |    8    |     121   |
| 16       |    4    |   125 |
| 16       |    2    |    86  |
| 16       |    1    |    47   |
|  64      |    4    |     115    |
|  32      |    4    |     120    |
|  16      |    4    |     125    |
|  8       |    4    |     81     | 
|  4       |    4    |     45     |
| 64       |    1    |    122     |
| 32       |    2    |    122     |
| 16       |    4    |    125     |
| 8        |    8    |    125     |
| 4        |   16    |    122     |
| 2        |   32    |   105      |


From the table, it seems that the total size of the pencil affects the performance most, and the shape of the pencil affects the performance less.




## Multiple Matrix-free functions in 1 kernel

```
function Operator_y_GPU(d_u, d_y, d2_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}) where {TILE_DIM1, TILE_DIM2}
	tidx = threadIdx().x
	tidy = threadIdx().y

	i = (blockIdx().x - 1) * TILE_DIM1 + tidx
	j = (blockIdx().y - 1) * TILE_DIM2 + tidy

	global_index = i + (j-1)*Nx

	HALO_WIDTH = 1
	tile = @cuStaticSharedMem(eltype(d_u),(TILE_DIM1+2*HALO_WIDTH,TILE_DIM2))

	k = tidx
	l = tidy

	# Writing pencil-shaped shared memory

	# for tile itself
	if k <= TILE_DIM1 && l <= TILE_DIM2 && global_index <= Nx*Ny
		@inbounds tile[k+HALO_WIDTH,l] = d_u[global_index]
	end

	sync_threads()

	# For upper halo
	if k <= HALO_WIDTH && l <= TILE_DIM2 && HALO_WIDTH + 1 <= global_index <= Nx*Ny + HALO_WIDTH
		@inbounds tile[k,l] = d_u[global_index - HALO_WIDTH]
	end

	sync_threads()

	# For lower halo
	if k >= TILE_DIM1 - HALO_WIDTH && l <= TILE_DIM2 && HALO_WIDTH + 1 <= global_index <= Nx*Ny - HALO_WIDTH
		@inbounds tile[k+2*HALO_WIDTH,l] = d_u[global_index + HALO_WIDTH]
	end

	sync_threads()

	# Finite Difference Operations starts here

	# For d2_y, output second order differential operators in y direction

	#Upper Boundary
	if k + HALO_WIDTH <= TILE_DIM1 + 2*HALO_WIDTH && l <= TILE_DIM2 && i == 1 && j <= Ny
		@inbounds d2_y[global_index] = (tile[k+HALO_WIDTH,l] - 2*tile[k+HALO_WIDTH+1,l] + tile[k+HALO_WIDTH+2,l]) / h^2
		@inbounds d_y[global_index] = (tile[k+HALO_WIDTH+1,l] - tile[k+HALO_WIDTH,l]) / h
	end

	sync_threads()

	#Center
	if k + HALO_WIDTH <= TILE_DIM1 + 2*HALO_WIDTH && l <= TILE_DIM2 && 2 <= i <= Nx-1 && j <= Ny
		@inbounds d2_y[global_index] = (tile[k+HALO_WIDTH-1,l] - 2*tile[k+HALO_WIDTH,l] + tile[k+HALO_WIDTH+1,l]) / h^2
		@inbounds d_y[global_index] = (tile[k+HALO_WIDTH+1,l] - tile[k+HALO_WIDTH-1,l]) / (2*h)
	end

	sync_threads()

	#Lower Boundary
	if k + HALO_WIDTH <= TILE_DIM1 + 2*HALO_WIDTH && l <= TILE_DIM2 && i == Nx && j <= Ny
		@inbounds d2_y[global_index] = (tile[k+HALO_WIDTH-2,l] - 2*tile[k+HALO_WIDTH-1,l] + tile[k+HALO_WIDTH,l]) / h^2
		@inbounds d_y[global_index] = (tile[k+HALO_WIDTH,l] - tile[k+HALO_WIDTH-1,l]) / h
	end

	# For d_y, output first order differential operators in y direction

	# if k + HALO_WIDTH <= TILE_DIM1 + 2*HALO_WIDTH && l <= TILE_DIM2 && i == 1 && j <= Ny
	# 	d_y[global_index] = (tile[k+HALO_WIDTH+2,l] - tile[k+HALO_WIDTH+1]) / h
	# end

	nothing
end
```




```
function tester_Operator_y_GPU(Nx)
	Ny = Nx
	h = 1/Nx
	TILE_DIM_1 = 16
	TILE_DIM_2 = 4

	u = randn(Nx*Ny)
	d_u = CuArray(u)
	d_y = similar(d_u)
	d2_y = similar(d_u)
	d_y7 = similar(d_u)
	y = Dy(u,Nx,Ny,h)
	y2 = D2y(u,Nx,Ny,h)

	griddim = (div(Nx,TILE_DIM_1) + 1, div(Ny,TILE_DIM_2) + 1)
	blockdim = (TILE_DIM_1,TILE_DIM_2)

	TILE_DIM = 32
	THREAD_NUM = 32
	BLOCK_NUM = div(Nx * Ny,TILE_DIM) + 1

	@cuda threads=blockdim blocks=griddim Operator_y_GPU(d_u,d_y, d2_y, Nx,Ny,h,Val(TILE_DIM_1),Val(TILE_DIM_2))
	@show Array(d_y) ≈ Array(y)
	@show Array(d2_y) ≈ Array(y2)


	# Starting test

	rep_times = 10

	ty = time_ns()
	for i in 1:rep_times
		y = Dy(u,Nx,Ny,h)
	end
	ty_end = time_ns()
	t1 = ty_end - ty

	memsize = length(u) * sizeof(eltype(u))
	@printf("CPU Through-put (Dy) %20.2f\n", 2 * memsize * rep_times / t1)

	t2y = time_ns()
	for i in 1:rep_times
		y2 = D2y(u,Nx,Ny,h)
	end
	t2y_end = time_ns()
	t2 = t2y_end - t2y

	@printf("CPU Through-put (D2y) %20.2f\n", 2 * memsize * rep_times / t1)
	@printf("CPU through-put (Dy + D2y, serial) %20.2f\n", 2 * memsize * rep_times / (t1+t2))

	td2y = time_ns()
	for i in 1:rep_times
		@cuda threads=blockdim blocks=griddim D2y_GPU_v7(d_u,d_y7, Nx, Ny, h, Val(TILE_DIM_1), Val(TILE_DIM_2))
	end
	synchronize()
	td2y_end = time_ns()
	t3 = td2y_end - td2y
	@printf("GPU through-put (D2y_GPU_v7) %20.2f\n", 2 * memsize * rep_times / t3)

	t_GPUy = time_ns()
	for i in 1:rep_times
		@cuda threads=blockdim blocks=griddim Operator_y_GPU(d_u,d_y, d2_y, Nx,Ny,h,Val(TILE_DIM_1),Val(TILE_DIM_2))
	end
	synchronize()
	t_GPUy_end = time_ns()
	t4 = t_GPUy_end - t_GPUy
	@printf("GPU through-put (Operator_y_GPU) %20.2f\n", 2* memsize * rep_times / t4)

	return Float64(t1), Float64(t2), Float64(t3), Float64(t4)

	return nothing
end
```

```
julia> tester_Operator_y_GPU(2000)
Array(d_y) ≈ Array(y) = true
Array(d2_y) ≈ Array(y2) = true
CPU Through-put (Dy)                 2.41
CPU Through-put (D2y)                 2.41
CPU through-put (Dy + D2y, serial)                 1.21
GPU through-put (D2y_GPU_v7)                96.95
GPU through-put (Operator_y_GPU)                49.16
(2.65638208e8, 2.61584846e8, 6.601347e6, 1.3019436e7)
```




Removing the step of assigning calculation results to Dy, only do calculation

```
julia> tester_Operator_y_GPU(2000)
Array(d_y) ≈ Array(y) = false
Array(d2_y) ≈ Array(y2) = true
CPU Through-put (Dy)                 2.48
CPU Through-put (D2y)                 2.48
CPU through-put (Dy + D2y, serial)                 1.18
GPU through-put (D2y_GPU_v7)                96.84
GPU through-put (Operator_y_GPU)                80.08
(2.58328058e8, 2.85296933e8, 6.608578e6, 7.991919e6)
```


The difference is obvious ...

Loading two vector dy, d2_y and assign the results into them is very slow. Need better mechanism ...

