# August 27th Meeting


## Status

1. I finished developing GPU kernels with shared memory for all 2nd-order SBP operators
    - Unified the GPU kernel interfaces so that a single tester function can test them all
    - Verified all implemented functions
    - FACEtoVOL and VOLtoFACE couldn't be implemented on the GPU with good performance, so I used the CPU versions
    
    
Sample Code:
Sample 1.

Unified GPU function names and variable names
 
``` 
"""
    D2x_GPU(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM})

Signature-only placeholder for the `D2x` GPU kernel (the variant without the
`_shared` suffix). The kernel body is omitted in these notes, so calling it does
nothing. `TILE_DIM` is supplied at the type level through `Val`.
"""
function D2x_GPU(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM}) where {TILE_DIM}
    return nothing
end

"""
    D2x_GPU_shared(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2})

Signature-only placeholder for the `_shared` variant of the `D2x` GPU kernel. The
kernel body is omitted in these notes, so calling it does nothing. Both tile
dimensions are supplied at the type level through `Val`.
"""
function D2x_GPU_shared(
    d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}
) where {TILE_DIM1, TILE_DIM2}
    return nothing
end

"""
    Dx_GPU_shared(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2})

Signature-only placeholder for the `_shared` variant of the `Dx` GPU kernel. The
kernel body is omitted in these notes, so calling it does nothing. Both tile
dimensions are supplied at the type level through `Val`.
"""
function Dx_GPU_shared(
    d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}
) where {TILE_DIM1, TILE_DIM2}
    return nothing
end

"""
    Hxinv_GPU_shared(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2})

Signature-only placeholder for the `_shared` variant of the `Hxinv` GPU kernel. The
kernel body is omitted in these notes, so calling it does nothing. Both tile
dimensions are supplied at the type level through `Val`.
"""
function Hxinv_GPU_shared(
    d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}
) where {TILE_DIM1, TILE_DIM2}
    return nothing
end

"""
    Hx_GPU_shared(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2})

Signature-only placeholder for the `_shared` variant of the `Hx` GPU kernel. The
kernel body is omitted in these notes, so calling it does nothing. Both tile
dimensions are supplied at the type level through `Val`.
"""
function Hx_GPU_shared(
    d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}
) where {TILE_DIM1, TILE_DIM2}
    return nothing
end

"""
    D2y_GPU(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM})

Signature-only placeholder for the `D2y` GPU kernel (the variant without the
`_shared` suffix). The kernel body is omitted in these notes, so calling it does
nothing. `TILE_DIM` is supplied at the type level through `Val`.
"""
function D2y_GPU(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM}) where {TILE_DIM}
    return nothing
end

"""
    D2y_GPU_shared(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2})

Signature-only placeholder for the `_shared` variant of the `D2y` GPU kernel. The
kernel body is omitted in these notes, so calling it does nothing. Both tile
dimensions are supplied at the type level through `Val`.
"""
function D2y_GPU_shared(
    d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}
) where {TILE_DIM1, TILE_DIM2}
    return nothing
end

"""
    Dy_GPU_shared(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2})

Signature-only placeholder for the `_shared` variant of the `Dy` GPU kernel. The
kernel body is omitted in these notes, so calling it does nothing. Both tile
dimensions are supplied at the type level through `Val`.
"""
function Dy_GPU_shared(
    d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}
) where {TILE_DIM1, TILE_DIM2}
    return nothing
end

"""
    Hyinv_GPU_shared(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2})

Signature-only placeholder for the `_shared` variant of the `Hyinv` GPU kernel. The
kernel body is omitted in these notes, so calling it does nothing. Both tile
dimensions are supplied at the type level through `Val`.
"""
function Hyinv_GPU_shared(
    d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}
) where {TILE_DIM1, TILE_DIM2}
    return nothing
end

"""
    Hy_GPU_shared(d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2})

Signature-only placeholder for the `_shared` variant of the `Hy` GPU kernel. The
kernel body is omitted in these notes, so calling it does nothing. Both tile
dimensions are supplied at the type level through `Val`.
"""
function Hy_GPU_shared(
    d_u, d_y, Nx, Ny, h, ::Val{TILE_DIM1}, ::Val{TILE_DIM2}
) where {TILE_DIM1, TILE_DIM2}
    return nothing
end

```


Sample 2. tester_function

```
"""
    tester_function(f, Nx, TILE_DIM_1, TILE_DIM_2, TILE_DIM)

Compare the CPU operator `f` against its two GPU kernels and print timings.

The kernels are looked up by name in the current module as `<f>_GPU` and
`<f>_GPU_shared`. A random input of length `Nx * Nx` is generated (`Ny` is set equal
to `Nx`, `h = 1 / Nx`), all three implementations are run once and compared with
`≈`, and then each one is timed over 10 repetitions.

# Arguments
- `f`: CPU reference, called as `f(u, Nx, Ny, h)`.
- `Nx`: number of grid points per direction.
- `TILE_DIM_1`, `TILE_DIM_2`: thread-block dimensions for the `_shared` kernel.
- `TILE_DIM`: threads per block for the non-shared kernel.

All results are printed; the function's return value is not meaningful.
"""
function tester_function(f, Nx, TILE_DIM_1, TILE_DIM_2, TILE_DIM)
    Ny = Nx
    @show f
    # Resolve the kernels with a plain module lookup instead of `@eval`. `@eval` runs
    # at global scope, so it would create global bindings on every call and then read
    # them from the already-running function, which is world-age sensitive.
    gpu_function = getfield(@__MODULE__, Symbol(f, "_GPU"))
    gpu_function_shared = getfield(@__MODULE__, Symbol(f, "_GPU_shared"))
    @show gpu_function
    @show gpu_function_shared
    h = 1 / Nx

    u = randn(Nx * Ny)
    d_u = CuArray(u)
    d_y = similar(d_u)
    d_y2 = similar(d_u)

    # 2-D launch configuration for the shared-memory kernel.
    griddim = (div(Nx, TILE_DIM_1) + 1, div(Ny, TILE_DIM_2) + 1)
    blockdim = (TILE_DIM_1, TILE_DIM_2)

    # 1-D launch configuration for the non-shared kernel.
    THREAD_NUM = TILE_DIM
    BLOCK_NUM = div(Nx * Ny, TILE_DIM) + 1

    # Correctness check. These first launches also trigger kernel compilation, so the
    # timed loops below do not include it.
    y = f(u, Nx, Ny, h)
    @cuda threads=THREAD_NUM blocks=BLOCK_NUM gpu_function(
        d_u, d_y, Nx, Ny, h, Val(TILE_DIM)
    )
    @cuda threads=blockdim blocks=griddim gpu_function_shared(
        d_u, d_y2, Nx, Ny, h, Val(TILE_DIM_1), Val(TILE_DIM_2)
    )
    @show y ≈ Array(d_y)
    @show y ≈ Array(d_y2)
    @show y - Array(d_y2)

    rep_times = 10

    # --- CPU timing ---
    t_y = time_ns()
    for _ in 1:rep_times
        y = f(u, Nx, Ny, h)
    end
    t_y_end = time_ns()
    t1 = t_y_end - t_y

    # Throughput counts one read and one write of the vector per call; bytes per
    # nanosecond is numerically GB/s.
    memsize = length(u) * sizeof(eltype(u))
    @show Float64(t1)
    @printf("CPU Through-put %20.2f\n", 2 * memsize * rep_times / t1)

    println()

    # --- GPU timing, non-shared kernel ---
    t_d_y = time_ns()
    for _ in 1:rep_times
        @cuda threads=THREAD_NUM blocks=BLOCK_NUM gpu_function(
            d_u, d_y, Nx, Ny, h, Val(TILE_DIM)
        )
    end
    synchronize()  # launches are asynchronous; wait before stopping the clock
    t_d_y_end = time_ns()
    t2 = t_d_y_end - t_d_y
    @show Float64(t2)
    @show Float64(t1)/Float64(t2)
    @printf("GPU Through-put (naive) %20.2f\n", 2 * memsize * rep_times / t2)

    println()

    # --- GPU timing, shared-memory kernel ---
    t_d_y2 = time_ns()
    for _ in 1:rep_times
        @cuda threads=blockdim blocks=griddim gpu_function_shared(
            d_u, d_y2, Nx, Ny, h, Val(TILE_DIM_1), Val(TILE_DIM_2)
        )
    end
    synchronize()
    t_d_y2_end = time_ns()
    t3 = t_d_y2_end - t_d_y2

    @show Float64(t3)
    @show Float64(t1)/Float64(t3)
    @printf("GPU Through-put (shared memory)%20.2f\n", 2 * memsize * rep_times / t3)

end
```


2. I developed the myMAT_beta_GPU!(u) function to assemble the GPU kernels and work with conjugate_beta_GPU()
    - The output should be the left-hand side A*u
    - All GPU kernels work, and myMAT_beta_GPU!(u) works with conjugate_beta_GPU() to run conjugate gradient
    - There are still some bugs when returning GPU arrays as outputs of myMAT_beta_GPU!(). I am still debugging this part
    - The current version performs many CPU-GPU array conversions. Future work will require redesigning the structure and containers to better reuse allocated resources and reduce CPU-GPU array conversions
    - Re-examined the previous matrix-free CPU code. There is room for optimization
    
    
Sample code 3. myMAT_beta_GPU!()
    
```
"""
    myMAT_beta_GPU!(du, u, container, var_test)

Apply the matrix-free operator to `u` with the GPU kernels and return the result
`A*u` as a host `Array`. The result is also copied into `du`.

# Arguments
- `du`: output vector; overwritten with the result.
- `u`: input vector. It is passed straight to the GPU kernels, so it is expected to
  be a device array.
- `container`: only its `yv2fs` entry is read here (forwarded to `VOLtoFACE_beta`).
- `var_test`: provides `Nx, Ny, hx, hy, alpha1, alpha2, alpha3, alpha4, beta`.

`VOLtoFACE_beta` still runs on the CPU, so each face term costs a device-to-host and
a host-to-device copy.
"""
function myMAT_beta_GPU!(du::AbstractVector, u::AbstractVector, container, var_test)
    @unpack yv2fs = container
    # Unpack from the `var_test` argument; the earlier draft read a global `var` here.
    @unpack Nx, Ny, hx, hy, alpha1, alpha2, alpha3, alpha4, beta = var_test

    N = Nx * Ny

    # Give every intermediate its own device buffer. Passing one shared array for all
    # fields made e.g. `du_x` and `du_y` alias the same memory, so kernels overwrote
    # each other's output. The struct is built positionally as (Nx, Ny, N, buffers...),
    # so the buffer count is the field count minus those three leading fields.
    nbuffers = fieldcount(intermediates_GPU_mutable) - 3
    buffers = ntuple(_ -> CuArray(zeros(N)), nbuffers)
    iGm = intermediates_GPU_mutable(Nx, Ny, N, buffers...)

    TILE_DIM_1 = 4
    TILE_DIM_2 = 16

    griddim_x = (div(Nx, TILE_DIM_1) + 1, div(Ny, TILE_DIM_2) + 1)
    griddim_y = (div(Nx, TILE_DIM_2) + 1, div(Ny, TILE_DIM_1) + 1)

    blockdim_x = (TILE_DIM_1, TILE_DIM_2)
    blockdim_y = (TILE_DIM_2, TILE_DIM_1)

    # x-direction launch: x block/grid shape, spacing hx, tile order (1, 2).
    launch_x(kernel, src, dst) = @cuda threads=blockdim_x blocks=griddim_x kernel(
        src, dst, Nx, Ny, hx, Val(TILE_DIM_1), Val(TILE_DIM_2)
    )
    # y-direction launch: y block/grid shape, spacing hy, tile order (2, 1).
    launch_y(kernel, src, dst) = @cuda threads=blockdim_y blocks=griddim_y kernel(
        src, dst, Nx, Ny, hy, Val(TILE_DIM_2), Val(TILE_DIM_1)
    )
    # Face restriction on the CPU, result moved back to the device.
    to_face(src, face) = CuArray(VOLtoFACE_beta(Array(src), face, Nx, Ny, N, yv2fs))

    # Kernels and array operations issued from this task run in order on one stream,
    # and `Array(::CuArray)` waits for pending work, so no explicit synchronization is
    # needed between the steps below.

    # Volume terms
    launch_x(D2x_GPU_shared, u, iGm.du_x)
    launch_y(D2y_GPU_shared, u, iGm.du_y)
    du_ops = iGm.du_x + iGm.du_y

    # Face 1
    launch_y(BySy_GPU_shared, u, iGm.du1)
    iGm.du2 = to_face(iGm.du1, 1)
    launch_y(Hyinv_GPU_shared, iGm.du2, iGm.du3)
    iGm.du3 = alpha1 * iGm.du3

    # Face 2 (the earlier draft stored a host array here without converting it)
    iGm.du5 = to_face(iGm.du1, 2)
    launch_y(Hyinv_GPU_shared, iGm.du5, iGm.du6)
    iGm.du6 = alpha2 * iGm.du6

    # Face 3
    iGm.du7 = to_face(u, 3)
    launch_x(BxSx_tran_GPU_shared, iGm.du7, iGm.du8)
    launch_x(Hxinv_GPU_shared, iGm.du8, iGm.du9)
    iGm.du9 = beta * iGm.du9

    launch_x(Hxinv_GPU_shared, iGm.du7, iGm.du11)
    iGm.du11 = alpha3 * iGm.du11

    # Face 4 (store into `iGm.du12`; the earlier draft assigned an unused local and the
    # kernels then read the zero-filled field)
    iGm.du12 = to_face(u, 4)
    launch_x(BxSx_tran_GPU_shared, iGm.du12, iGm.du13)
    launch_x(Hxinv_GPU_shared, iGm.du13, iGm.du14)
    # NOTE(review): the face-3 analogue of this term is scaled by `beta`, not by an
    # alpha coefficient. Confirm whether `alpha4` is intended here.
    iGm.du14 = alpha4 * iGm.du14
    launch_x(Hxinv_GPU_shared, iGm.du12, iGm.du16)
    iGm.du16 = alpha4 * iGm.du16

    iGm.du0 = du_ops + iGm.du3 + iGm.du6 + iGm.du9 + iGm.du11 + iGm.du14 + iGm.du16

    # Final Hy and Hx passes, using the launch configuration and spacing that match
    # each direction (the earlier draft mixed x and y settings here).
    launch_y(Hy_GPU_shared, iGm.du0, iGm.du17)
    launch_x(Hx_GPU_shared, iGm.du17, iGm.du)
    synchronize()

    # Return the final product rather than the intermediate `du_y`, and honour the
    # in-place contract implied by the `!` in the name.
    copyto!(du, iGm.du)
    return Array(iGm.du)
end
```


Sample code 4. conjugate_beta_GPU() function

```
"""
    conjugate_beta_GPU(myMAT_beta_GPU!, r, b, container, var, intermediate, maxIteration)

Run conjugate-gradient iterations for `A*u = b`, starting from `u = 0`, where the
product `A*p` is obtained from `myMAT_beta_GPU!(du, p_device, container, var)`.

# Arguments
- `myMAT_beta_GPU!`: operator callback; its result is converted with `Array(...)`.
- `r`: host vector overwritten with the residual.
- `b`: right-hand side (host vector).
- `container`, `var`: forwarded to the operator; `var` also provides `N`.
- `intermediate`: currently unused.
- `maxIteration`: maximum number of iterations.

# Returns
`(u, counts)`: the solution as a device array, and the number of iterations that
completed without meeting the stopping test.
"""
function conjugate_beta_GPU(myMAT_beta_GPU!, r, b, container, var, intermediate, maxIteration)
    @unpack N = var

    # Accumulate the iterate on the host, where `r` and `p` already live. The earlier
    # draft called `axpy!(alpha, p, Array(u))`, which only updated a temporary copy,
    # so the returned `u` was always zero.
    u_host = zeros(N)
    du = CuArray(zeros(N))
    # NOTE(review): this is an absolute tolerance on the residual norm and is below
    # double-precision round-off for O(1) data, so the loop will usually run to
    # `maxIteration`. Confirm the intended value.
    tol = 1e-16

    r .= b .- Array(myMAT_beta_GPU!(du, CuArray(u_host), container, var))
    p = copy(r)  # copy, so that updating p does not modify r
    rsold = r' * r
    counts = 0
    for _ in 1:maxIteration
        Ap = Array(myMAT_beta_GPU!(du, CuArray(p), container, var))
        alpha = rsold / (p' * Ap)
        axpy!(alpha, p, u_host)  # u = u + alpha * p
        axpy!(-alpha, Ap, r)     # r = r - alpha * Ap
        rsnew = r' * r
        if sqrt(rsnew) < tol
            break
        end
        # p = r + (rsnew / rsold) * p
        p .= (rsnew / rsold) .* p .+ r

        rsold = rsnew
        counts += 1
    end
    # Hand the solution back as a device array, matching the original return type.
    return CuArray(u_host), counts
end
```

