More GPU Performance Optimization 

# A list of GPUs has been tested by the following Kernel, it's part of our opensource CFD framwork [XFLUIDS](https://github.com/XFluids/XFluids):
## llvm-16.0.6 is implemented on A100, amd Rocm-5.4.1 on RX6800XT, without competitive performance over CUDA/HIP model, we are now using multi-pass compile system

## the Performance listed here:
![图片1](https://github.com/AdaptiveCpp/AdaptiveCpp/assets/50227398/b75ace19-0e72-4e73-a103-bcb17978ce55)

## Kernel Code listed here:
```
#define Emax 5
#define NUM_COP 0
#define _DF(a) a
using real_t = double;

#include <sycl.hpp>

// // AdaptiveCpp HIP Target
#ifdef __HIPSYCL_ENABLE_HIP_TARGET__
#define __LBMt 256 // <=256 for HIP
using vendorError_t = hipError_t;
using vendorDeviceProp = hipDeviceProp_t;
using vendorFuncAttributes = hipFuncAttributes;

#define _VENDOR_KERNEL_ __global__ __launch_bounds__(256, 1)

#define vendorSuccess hipSuccess;
#define vendorSetDevice(A) hipSetDevice(A)
#define vendorGetLastError() hipGetLastError()
#define vendorDeviceSynchronize() hipDeviceSynchronize()
#define vendorFuncGetAttributes(A, B) hipFuncGetAttributes(A, B)
#define vendorGetDeviceProperties(A, B) hipGetDeviceProperties(A, B)

#define CheckGPUErrors(call)                                                             \
	{                                                                                    \
		hipError_t hipStatus = call;                                                     \
		if (hipSuccess != hipStatus)                                                     \
		{                                                                                \
			fprintf(stderr,                                                              \
					"ERROR: CUDA RT call \"%s\" in line %d of file %s failed "           \
					"with "                                                              \
					"%s (%d).\n",                                                        \
					#call, __LINE__, __FILE__, hipGetErrorString(hipStatus), hipStatus); \
			exit(EXIT_FAILURE);                                                          \
		}                                                                                \
	}                                                                                    \
	while (0)

// // OpenSYCL CUDA Target
#elif defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
#define __LBMt 256 // <=256 for HIP
using vendorError_t = cudaError_t;
using vendorDeviceProp = cudaDeviceProp;
using vendorFuncAttributes = cudaFuncAttributes;

#define _VENDOR_KERNEL_ __global__

#define vendorSuccess cudaSuccess;
#define vendorSetDevice(A) cudaSetDevice(A)
#define vendorGetLastError() cudaGetLastError()
#define vendorDeviceSynchronize() cudaDeviceSynchronize()
#define vendorFuncGetAttributes(A, B) cudaFuncGetAttributes(A, B)
#define vendorGetDeviceProperties(A, B) cudaGetDeviceProperties(A, B)

#define CheckGPUErrors(call)                                                                \
	{                                                                                       \
		cudaError_t cudaStatus = call;                                                      \
		if (cudaSuccess != cudaStatus)                                                      \
		{                                                                                   \
			fprintf(stderr,                                                                 \
					"ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
					"with "                                                                 \
					"%s (%d).\n",                                                           \
					#call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
			exit(EXIT_FAILURE);                                                             \
		}                                                                                   \
	}                                                                                       \
	while (0)

#else // defined(__HIPSYCL_ENABLE_HIP_TARGET__) || (__HIPSYCL_ENABLE_CUDA_TARGET__)

#define SYCL_KERNEL __host__ __device__
#define SYCL_DEVICE __host__ __device__
#define _VENDOR_KERNEL_LB_(A, B) __global__ __launch_bounds__(A, B)

#endif // end Marco

typedef struct
{
	int Xmax, Ymax, Zmax;
	int X_inner, Y_inner, Z_inner;
	int Bwidth_X, Bwidth_Y, Bwidth_Z;
} MeshSize;

SYCL_DEVICE inline real_t weno5old_BODY(const real_t v1, const real_t v2, const real_t v3, const real_t v4, const real_t v5)
{
	real_t a1, a2, a3;
	real_t dtwo = _DF(2.0), dtre = _DF(3.0);

	a1 = v1 - dtwo * v2 + v3;
	real_t s1 = _DF(13.0) * a1 * a1;
	a1 = v1 - _DF(4.0) * v2 + dtre * v3;
	s1 += dtre * a1 * a1;
	a1 = v2 - dtwo * v3 + v4;
	real_t s2 = _DF(13.0) * a1 * a1;
	a1 = v2 - v4;
	s2 += dtre * a1 * a1;
	a1 = v3 - dtwo * v4 + v5;
	real_t s3 = _DF(13.0) * a1 * a1;
	a1 = dtre * v3 - _DF(4.0) * v4 + v5;
	s3 += dtre * a1 * a1;

	real_t tol = _DF(1.0E-6);
	s1 += tol, s2 += tol, s3 += tol;

	// // for double precision
	a1 = _DF(0.1) * s2 * s2 * s3 * s3;
	a2 = _DF(0.2) * s1 * s1 * s3 * s3;
	a3 = _DF(0.3) * s1 * s1 * s2 * s2;

	real_t tw1 = _DF(1.0) / (a1 + a2 + a3);
	a1 = a1 * tw1, a2 = a2 * tw1, a3 = a3 * tw1;

	s1 = a1 * (dtwo * v1 - _DF(7.0) * v2 + _DF(11.0) * v3);
	s2 = a2 * (-v2 + _DF(5.0) * v3 + dtwo * v4);
	s3 = a3 * (dtwo * v3 + _DF(5.0) * v4 - v5);

	return (s1 + s2 + s3);
}

SYCL_DEVICE inline real_t weno5old_GPU(real_t *f, real_t *m)
{
	int k;
	real_t v1, v2, v3, v4, v5, temf, temm;
	// ff
	k = 0;
	v1 = *(f + k - 2);
	v2 = *(f + k - 1);
	v3 = *(f + k);
	v4 = *(f + k + 1);
	v5 = *(f + k + 2);

	temf = weno5old_BODY(v1, v2, v3, v4, v5); //* sum;
	// mm
	k = 1;
	v1 = *(m + k + 2);
	v2 = *(m + k + 1);
	v3 = *(m + k);
	v4 = *(m + k - 1);
	v5 = *(m + k - 2);

	temm = weno5old_BODY(v1, v2, v3, v4, v5); //* sum;

	return (temf + temm) * _six;
}

SYCL_DEVICE inline void RoeAverageLeft_x(int const n, real_t *eigen_l, real_t &eigen_value, real_t *z, const real_t *yi, real_t const c2,
										 real_t const _rho, real_t const _u, real_t const _v, real_t const _w, real_t const _H,
										 real_t const b1, real_t const b3, real_t Gamma)
{
	real_t q2 = _u * _u + _v * _v + _w * _w;
	real_t _c = sycl::sqrt(c2);
	real_t b2 = _DF(1.0) + b1 * q2 - b1 * _H;
	real_t _c1 = _DF(1.0) / _c;
	real_t _u_c = _u * _c1;

	if (0 == n)
	{
		eigen_l[0] = _DF(0.5) * (b2 + _u_c + b3);
		eigen_l[1] = -_DF(0.5) * (b1 * _u + _c1);
		eigen_l[2] = -_DF(0.5) * (b1 * _v);
		eigen_l[3] = -_DF(0.5) * (b1 * _w);
		eigen_l[4] = _DF(0.5) * b1;
		eigen_value = sycl::fabs(_u - _c);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = -_DF(0.5) * b1 * z[m];
	}
	else if (1 == n)
	{
		eigen_l[0] = (_DF(1.0) - b2 - b3) / b1;
		eigen_l[1] = _u;
		eigen_l[2] = _v;
		eigen_l[3] = _w;
		eigen_l[4] = -_DF(1.0);
		eigen_value = sycl::fabs(_u);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = z[m];
	}
	else if (2 == n)
	{
		eigen_l[0] = _v;
		eigen_l[1] = _DF(0.0);
		eigen_l[2] = -_DF(1.0);
		eigen_l[3] = _DF(0.0);
		eigen_l[4] = _DF(0.0);
		eigen_value = sycl::fabs(_u);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (3 == n)
	{
		eigen_l[0] = -_w;
		eigen_l[1] = _DF(0.0);
		eigen_l[2] = _DF(0.0);
		eigen_l[3] = _DF(1.0);
		eigen_l[4] = _DF(0.0);
		eigen_value = sycl::fabs(_u);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (Emax - 1 == n)
	{
		eigen_l[0] = _DF(0.5) * (b2 - _u_c + b3);
		eigen_l[1] = _DF(0.5) * (-b1 * _u + _c1);
		eigen_l[2] = _DF(0.5) * (-b1 * _v);
		eigen_l[3] = _DF(0.5) * (-b1 * _w);
		eigen_l[4] = _DF(0.5) * b1;
		eigen_value = sycl::fabs(_u + _c);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = -_DF(0.5) * b1 * z[m];
	}
}

SYCL_DEVICE inline void RoeAverageRight_x(int const n, real_t *eigen_r, real_t *z, const real_t *yi, real_t const c2,
										  real_t const _rho, real_t const _u, real_t const _v, real_t const _w, real_t const _H,
										  real_t const b1, real_t const b3, real_t Gamma)
{
	real_t q2 = _u * _u + _v * _v + _w * _w;
	real_t _c = sycl::sqrt(c2);
	real_t b2 = _DF(1.0) + b1 * q2 - b1 * _H;
	real_t _c1 = _DF(1.0) / _c;
	real_t _u_c = _u * _c1;

	if (0 == n)
	{
		eigen_r[0] = _DF(1.0);
		eigen_r[1] = _u - _c;
		eigen_r[2] = _v;
		eigen_r[3] = _w;
		eigen_r[4] = _H - _u * _c;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = yi[m];
	}
	else if (1 == n)
	{
		eigen_r[0] = b1;
		eigen_r[1] = _u * b1;
		eigen_r[2] = _v * b1;
		eigen_r[3] = _w * b1;
		eigen_r[4] = _H * b1 - _DF(1.0);
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = b1 * yi[m];
	}
	else if (2 == n)
	{
		eigen_r[0] = _DF(0.0);
		eigen_r[1] = _DF(0.0);
		eigen_r[2] = -_DF(1.0);
		eigen_r[3] = _DF(0.0);
		eigen_r[4] = -_v;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (3 == n)
	{
		eigen_r[0] = _DF(0.0);
		eigen_r[1] = _DF(0.0);
		eigen_r[2] = _DF(0.0);
		eigen_r[3] = _DF(1.0);
		eigen_r[4] = _w;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (Emax - 1 == n)
	{
		eigen_r[0] = _DF(1.0);
		eigen_r[1] = _u + _c;
		eigen_r[2] = _v;
		eigen_r[3] = _w;
		eigen_r[4] = _H + _u * _c;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = yi[m];
	}
}

extern SYCL_KERNEL void ReconstructFluxX(int i, int j, int k, real_t const dl, MeshSize bl, real_t *UI, real_t *Fl, real_t *Fwall, real_t *eigen_local,
										 real_t *p, real_t *rho, real_t *u, real_t *v, real_t *w, real_t *y, real_t *T, real_t *H, real_t *eigen_block)
{
	int Xmax = bl.Xmax;
	int Ymax = bl.Ymax;
	int Zmax = bl.Zmax;
	int X_inner = bl.X_inner;
	int Y_inner = bl.Y_inner;
	int Z_inner = bl.Z_inner;
	int Bwidth_X = bl.Bwidth_X;
	int Bwidth_Y = bl.Bwidth_Y;
	int Bwidth_Z = bl.Bwidth_Z;
	if (i >= X_inner + Bwidth_X)
		return;
	if (j >= Y_inner + Bwidth_Y)
		return;
	if (k >= Z_inner + Bwidth_Z)
		return;

	size_t id_l = Xmax * Ymax * k + Xmax * j + i;
	size_t id_r = Xmax * Ymax * k + Xmax * j + i + 1;

	// preparing some interval value for roe average
	real_t D = sycl::sqrt(rho[id_r] / rho[id_l]);
	real_t D1 = _DF(1.0) / (D + _DF(1.0));
	real_t _u = (u[id_l] + D * u[id_r]) * D1;
	real_t _v = (v[id_l] + D * v[id_r]) * D1;
	real_t _w = (w[id_l] + D * w[id_r]) * D1;
	real_t _H = (H[id_l] + D * H[id_r]) * D1;
	real_t _P = (p[id_l] + D * p[id_r]) * D1;
	real_t _rho = sycl::sqrt(rho[id_r] * rho[id_l]);
	real_t _yi[NUM_SPECIES] = {_DF(1.0)}, b3 = _DF(0.0), z[] = {_DF(0.0)};
	real_t Gamma0 = 1.4;
	real_t c2 = Gamma0 * _P / _rho; /*(_H - _DF(0.5) * (_u * _u + _v * _v + _w * _w));*/
	real_t b1 = (Gamma0 - _DF(1.0)) / c2;
	//     // construct the right value & the left value scalar equations by characteristic reduction
	//     // at i+1/2 in x direction
	real_t uf[10], ff[10], pp[10], mm[10], f_flux, _p[Emax][Emax], eigen_lr[Emax], eigen_value, artificial_viscosity;
	for (int n = 0; n < Emax; n++)
	{
		real_t eigen_local_max = _DF(0.0);
		RoeAverageLeft_x(n, eigen_lr, eigen_value, z, _yi, c2, _rho, _u, _v, _w, _H, b1, b3, Gamma0);
		for (int m = -stencil_P; m < stencil_size - stencil_P; m++)
		{
			int _i_1 = i + m, _j_1 = j, _k_1 = k; /* Xmax * Ymax * k + Xmax * j + i + m*/
			int id_local_1 = Xmax * Ymax * (_k_1) + Xmax * (_j_1) + (_i_1);
			eigen_local_max = sycl::max(eigen_local_max, sycl::fabs(eigen_local[Emax * id_local_1 + n])); /* local lax-friedrichs*/
		}
		artificial_viscosity = eigen_local_max;
		for (int m = -3; m <= 4; m++)
		{
			int _i_2 = i + m, _j_2 = j, _k_2 = k; /* Xmax * Ymax * k + Xmax * j + m + i; */
			int id_local = Xmax * Ymax * (_k_2) + Xmax * (_j_2) + (_i_2);
			uf[m + 3] = _DF(0.0);
			ff[m + 3] = _DF(0.0);
			for (int n1 = 0; n1 < Emax; n1++)
			{
				uf[m + 3] = uf[m + 3] + UI[Emax * id_local + n1] * eigen_lr[n1]; /* eigen_l actually */
				ff[m + 3] = ff[m + 3] + Fl[Emax * id_local + n1] * eigen_lr[n1];
			} /*  for local speed*/
			pp[m + 3] = _DF(0.5) * (ff[m + 3] + artificial_viscosity * uf[m + 3]);
			mm[m + 3] = _DF(0.5) * (ff[m + 3] - artificial_viscosity * uf[m + 3]);
		} /* calculate the scalar numerical flux at x direction*/
		f_flux = weno5old_GPU(&pp[3], &mm[3]);
		RoeAverageRight_x(n, eigen_lr, z, _yi, c2, _rho, _u, _v, _w, _H, b1, b3, Gamma0);
		for (int n1 = 0; n1 < Emax; n1++)
		{									   /* get Fp */
			_p[n][n1] = f_flux * eigen_lr[n1]; /* eigen_r actually */
		}
	}
	for (int n = 0; n < Emax; n++)
	{ /* reconstruction the F-flux terms*/
		real_t fluxl = _DF(0.0);
		for (int n1 = 0; n1 < Emax; n1++)
		{
			fluxl += _p[n1][n];
		}
		Fwall[Emax * id_l + n] = fluxl;
	}
}

#if __VENDOR_SUBMIT__
_VENDOR_KERNEL_LB_(__LBMt, 1)
void ReconstructFluxXVendorWrapper(real_t const dl, MeshSize bl, real_t *UI, real_t *Fl, real_t *Fwall, real_t *eigen_local,
								   real_t *p, real_t *rho, real_t *u, real_t *v, real_t *w, real_t *y, real_t *T, real_t *H, real_t *eigen_block)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x + bl.Bwidth_X - 1;
	int j = blockIdx.y * blockDim.y + threadIdx.y + bl.Bwidth_Y;
	int k = blockIdx.z * blockDim.z + threadIdx.z + bl.Bwidth_Z;

	ReconstructFluxX(i, j, k, dl, bl, UI, Fl, Fwall, eigen_local, eigen_lt, eigen_rt, p, rho, u, v, w, y, T, H, eigen_block);
}
#endif

// test Parallelism submission:
#if __VENDOR_SUBMIT__
ReconstructFluxXVendorWrapper<<<temfwx.global_gd(global_ndrange_x), temfwx.local_blk>>>(bl.dx, ms, UI, FluxF, FluxFw,
																						eigen_local_x, p, rho, u, v, w,
																						fdata.y, T, H, eigen_block_x);
#else
q.submit([&](sycl::handler &h) {																						 //
	h.parallel_for(sycl::nd_range<3>(temfwx.global_nd(global_ndrange_x), temfwx.local_nd), [=](sycl::nd_item<3> index) { //
		int i = index.get_global_id(0) + ms.Bwidth_X - 1;
		int j = index.get_global_id(1) + ms.Bwidth_Y;
		int k = index.get_global_id(2) + ms.Bwidth_Z;
		ReconstructFluxX(i, j, k, bl.dx, ms, UI, FluxF, FluxFw, eigen_local_x, p, rho, u, v, w, fdata.y, T, H, eigen_block_x);
	});
});
#endif
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

More GPU Performance Optimization #1399

A list of GPUs has been tested by the following Kernel, it's part of our opensource CFD framwork XFLUIDS:

llvm-16.0.6 is implemented on A100, amd Rocm-5.4.1 on RX6800XT, without competitive performance over CUDA/HIP model, we are now using multi-pass compile system

the Performance listed here:

Kernel Code listed here:

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

More GPU Performance Optimization #1399

Description

A list of GPUs has been tested by the following Kernel, it's part of our opensource CFD framwork XFLUIDS:

llvm-16.0.6 is implemented on A100, amd Rocm-5.4.1 on RX6800XT, without competitive performance over CUDA/HIP model, we are now using multi-pass compile system

the Performance listed here:

Kernel Code listed here:

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions