More GPU Performance Optimization #1399

Jiuxiaoyunhai · 2024-03-12T08:31:12Z

A list of GPUs has been tested by the following Kernel, it's part of our opensource CFD framwork XFLUIDS:

llvm-16.0.6 is implemented on A100, amd Rocm-5.4.1 on RX6800XT, without competitive performance over CUDA/HIP model, we are now using multi-pass compile system

the Performance listed here:

Kernel Code listed here:

#define Emax 5
#define NUM_COP 0
#define _DF(a) a
using real_t = double;

#include <sycl.hpp>

// // AdaptiveCpp HIP Target
#ifdef __HIPSYCL_ENABLE_HIP_TARGET__
#define __LBMt 256 // <=256 for HIP
using vendorError_t = hipError_t;
using vendorDeviceProp = hipDeviceProp_t;
using vendorFuncAttributes = hipFuncAttributes;

#define _VENDOR_KERNEL_ __global__ __launch_bounds__(256, 1)

#define vendorSuccess hipSuccess;
#define vendorSetDevice(A) hipSetDevice(A)
#define vendorGetLastError() hipGetLastError()
#define vendorDeviceSynchronize() hipDeviceSynchronize()
#define vendorFuncGetAttributes(A, B) hipFuncGetAttributes(A, B)
#define vendorGetDeviceProperties(A, B) hipGetDeviceProperties(A, B)

#define CheckGPUErrors(call)                                                             \
	{                                                                                    \
		hipError_t hipStatus = call;                                                     \
		if (hipSuccess != hipStatus)                                                     \
		{                                                                                \
			fprintf(stderr,                                                              \
					"ERROR: CUDA RT call \"%s\" in line %d of file %s failed "           \
					"with "                                                              \
					"%s (%d).\n",                                                        \
					#call, __LINE__, __FILE__, hipGetErrorString(hipStatus), hipStatus); \
			exit(EXIT_FAILURE);                                                          \
		}                                                                                \
	}                                                                                    \
	while (0)

// // OpenSYCL CUDA Target
#elif defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
#define __LBMt 256 // <=256 for HIP
using vendorError_t = cudaError_t;
using vendorDeviceProp = cudaDeviceProp;
using vendorFuncAttributes = cudaFuncAttributes;

#define _VENDOR_KERNEL_ __global__

#define vendorSuccess cudaSuccess;
#define vendorSetDevice(A) cudaSetDevice(A)
#define vendorGetLastError() cudaGetLastError()
#define vendorDeviceSynchronize() cudaDeviceSynchronize()
#define vendorFuncGetAttributes(A, B) cudaFuncGetAttributes(A, B)
#define vendorGetDeviceProperties(A, B) cudaGetDeviceProperties(A, B)

#define CheckGPUErrors(call)                                                                \
	{                                                                                       \
		cudaError_t cudaStatus = call;                                                      \
		if (cudaSuccess != cudaStatus)                                                      \
		{                                                                                   \
			fprintf(stderr,                                                                 \
					"ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
					"with "                                                                 \
					"%s (%d).\n",                                                           \
					#call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
			exit(EXIT_FAILURE);                                                             \
		}                                                                                   \
	}                                                                                       \
	while (0)

#else // defined(__HIPSYCL_ENABLE_HIP_TARGET__) || (__HIPSYCL_ENABLE_CUDA_TARGET__)

#define SYCL_KERNEL __host__ __device__
#define SYCL_DEVICE __host__ __device__
#define _VENDOR_KERNEL_LB_(A, B) __global__ __launch_bounds__(A, B)

#endif // end Marco

typedef struct
{
	int Xmax, Ymax, Zmax;
	int X_inner, Y_inner, Z_inner;
	int Bwidth_X, Bwidth_Y, Bwidth_Z;
} MeshSize;

SYCL_DEVICE inline real_t weno5old_BODY(const real_t v1, const real_t v2, const real_t v3, const real_t v4, const real_t v5)
{
	real_t a1, a2, a3;
	real_t dtwo = _DF(2.0), dtre = _DF(3.0);

	a1 = v1 - dtwo * v2 + v3;
	real_t s1 = _DF(13.0) * a1 * a1;
	a1 = v1 - _DF(4.0) * v2 + dtre * v3;
	s1 += dtre * a1 * a1;
	a1 = v2 - dtwo * v3 + v4;
	real_t s2 = _DF(13.0) * a1 * a1;
	a1 = v2 - v4;
	s2 += dtre * a1 * a1;
	a1 = v3 - dtwo * v4 + v5;
	real_t s3 = _DF(13.0) * a1 * a1;
	a1 = dtre * v3 - _DF(4.0) * v4 + v5;
	s3 += dtre * a1 * a1;

	real_t tol = _DF(1.0E-6);
	s1 += tol, s2 += tol, s3 += tol;

	// // for double precision
	a1 = _DF(0.1) * s2 * s2 * s3 * s3;
	a2 = _DF(0.2) * s1 * s1 * s3 * s3;
	a3 = _DF(0.3) * s1 * s1 * s2 * s2;

	real_t tw1 = _DF(1.0) / (a1 + a2 + a3);
	a1 = a1 * tw1, a2 = a2 * tw1, a3 = a3 * tw1;

	s1 = a1 * (dtwo * v1 - _DF(7.0) * v2 + _DF(11.0) * v3);
	s2 = a2 * (-v2 + _DF(5.0) * v3 + dtwo * v4);
	s3 = a3 * (dtwo * v3 + _DF(5.0) * v4 - v5);

	return (s1 + s2 + s3);
}

SYCL_DEVICE inline real_t weno5old_GPU(real_t *f, real_t *m)
{
	int k;
	real_t v1, v2, v3, v4, v5, temf, temm;
	// ff
	k = 0;
	v1 = *(f + k - 2);
	v2 = *(f + k - 1);
	v3 = *(f + k);
	v4 = *(f + k + 1);
	v5 = *(f + k + 2);

	temf = weno5old_BODY(v1, v2, v3, v4, v5); //* sum;
	// mm
	k = 1;
	v1 = *(m + k + 2);
	v2 = *(m + k + 1);
	v3 = *(m + k);
	v4 = *(m + k - 1);
	v5 = *(m + k - 2);

	temm = weno5old_BODY(v1, v2, v3, v4, v5); //* sum;

	return (temf + temm) * _six;
}

SYCL_DEVICE inline void RoeAverageLeft_x(int const n, real_t *eigen_l, real_t &eigen_value, real_t *z, const real_t *yi, real_t const c2,
										 real_t const _rho, real_t const _u, real_t const _v, real_t const _w, real_t const _H,
										 real_t const b1, real_t const b3, real_t Gamma)
{
	real_t q2 = _u * _u + _v * _v + _w * _w;
	real_t _c = sycl::sqrt(c2);
	real_t b2 = _DF(1.0) + b1 * q2 - b1 * _H;
	real_t _c1 = _DF(1.0) / _c;
	real_t _u_c = _u * _c1;

	if (0 == n)
	{
		eigen_l[0] = _DF(0.5) * (b2 + _u_c + b3);
		eigen_l[1] = -_DF(0.5) * (b1 * _u + _c1);
		eigen_l[2] = -_DF(0.5) * (b1 * _v);
		eigen_l[3] = -_DF(0.5) * (b1 * _w);
		eigen_l[4] = _DF(0.5) * b1;
		eigen_value = sycl::fabs(_u - _c);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = -_DF(0.5) * b1 * z[m];
	}
	else if (1 == n)
	{
		eigen_l[0] = (_DF(1.0) - b2 - b3) / b1;
		eigen_l[1] = _u;
		eigen_l[2] = _v;
		eigen_l[3] = _w;
		eigen_l[4] = -_DF(1.0);
		eigen_value = sycl::fabs(_u);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = z[m];
	}
	else if (2 == n)
	{
		eigen_l[0] = _v;
		eigen_l[1] = _DF(0.0);
		eigen_l[2] = -_DF(1.0);
		eigen_l[3] = _DF(0.0);
		eigen_l[4] = _DF(0.0);
		eigen_value = sycl::fabs(_u);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (3 == n)
	{
		eigen_l[0] = -_w;
		eigen_l[1] = _DF(0.0);
		eigen_l[2] = _DF(0.0);
		eigen_l[3] = _DF(1.0);
		eigen_l[4] = _DF(0.0);
		eigen_value = sycl::fabs(_u);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (Emax - 1 == n)
	{
		eigen_l[0] = _DF(0.5) * (b2 - _u_c + b3);
		eigen_l[1] = _DF(0.5) * (-b1 * _u + _c1);
		eigen_l[2] = _DF(0.5) * (-b1 * _v);
		eigen_l[3] = _DF(0.5) * (-b1 * _w);
		eigen_l[4] = _DF(0.5) * b1;
		eigen_value = sycl::fabs(_u + _c);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = -_DF(0.5) * b1 * z[m];
	}
}

SYCL_DEVICE inline void RoeAverageRight_x(int const n, real_t *eigen_r, real_t *z, const real_t *yi, real_t const c2,
										  real_t const _rho, real_t const _u, real_t const _v, real_t const _w, real_t const _H,
										  real_t const b1, real_t const b3, real_t Gamma)
{
	real_t q2 = _u * _u + _v * _v + _w * _w;
	real_t _c = sycl::sqrt(c2);
	real_t b2 = _DF(1.0) + b1 * q2 - b1 * _H;
	real_t _c1 = _DF(1.0) / _c;
	real_t _u_c = _u * _c1;

	if (0 == n)
	{
		eigen_r[0] = _DF(1.0);
		eigen_r[1] = _u - _c;
		eigen_r[2] = _v;
		eigen_r[3] = _w;
		eigen_r[4] = _H - _u * _c;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = yi[m];
	}
	else if (1 == n)
	{
		eigen_r[0] = b1;
		eigen_r[1] = _u * b1;
		eigen_r[2] = _v * b1;
		eigen_r[3] = _w * b1;
		eigen_r[4] = _H * b1 - _DF(1.0);
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = b1 * yi[m];
	}
	else if (2 == n)
	{
		eigen_r[0] = _DF(0.0);
		eigen_r[1] = _DF(0.0);
		eigen_r[2] = -_DF(1.0);
		eigen_r[3] = _DF(0.0);
		eigen_r[4] = -_v;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (3 == n)
	{
		eigen_r[0] = _DF(0.0);
		eigen_r[1] = _DF(0.0);
		eigen_r[2] = _DF(0.0);
		eigen_r[3] = _DF(1.0);
		eigen_r[4] = _w;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (Emax - 1 == n)
	{
		eigen_r[0] = _DF(1.0);
		eigen_r[1] = _u + _c;
		eigen_r[2] = _v;
		eigen_r[3] = _w;
		eigen_r[4] = _H + _u * _c;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = yi[m];
	}
}

extern SYCL_KERNEL void ReconstructFluxX(int i, int j, int k, real_t const dl, MeshSize bl, real_t *UI, real_t *Fl, real_t *Fwall, real_t *eigen_local,
										 real_t *p, real_t *rho, real_t *u, real_t *v, real_t *w, real_t *y, real_t *T, real_t *H, real_t *eigen_block)
{
	int Xmax = bl.Xmax;
	int Ymax = bl.Ymax;
	int Zmax = bl.Zmax;
	int X_inner = bl.X_inner;
	int Y_inner = bl.Y_inner;
	int Z_inner = bl.Z_inner;
	int Bwidth_X = bl.Bwidth_X;
	int Bwidth_Y = bl.Bwidth_Y;
	int Bwidth_Z = bl.Bwidth_Z;
	if (i >= X_inner + Bwidth_X)
		return;
	if (j >= Y_inner + Bwidth_Y)
		return;
	if (k >= Z_inner + Bwidth_Z)
		return;

	size_t id_l = Xmax * Ymax * k + Xmax * j + i;
	size_t id_r = Xmax * Ymax * k + Xmax * j + i + 1;

	// preparing some interval value for roe average
	real_t D = sycl::sqrt(rho[id_r] / rho[id_l]);
	real_t D1 = _DF(1.0) / (D + _DF(1.0));
	real_t _u = (u[id_l] + D * u[id_r]) * D1;
	real_t _v = (v[id_l] + D * v[id_r]) * D1;
	real_t _w = (w[id_l] + D * w[id_r]) * D1;
	real_t _H = (H[id_l] + D * H[id_r]) * D1;
	real_t _P = (p[id_l] + D * p[id_r]) * D1;
	real_t _rho = sycl::sqrt(rho[id_r] * rho[id_l]);
	real_t _yi[NUM_SPECIES] = {_DF(1.0)}, b3 = _DF(0.0), z[] = {_DF(0.0)};
	real_t Gamma0 = 1.4;
	real_t c2 = Gamma0 * _P / _rho; /*(_H - _DF(0.5) * (_u * _u + _v * _v + _w * _w));*/
	real_t b1 = (Gamma0 - _DF(1.0)) / c2;
	//     // construct the right value & the left value scalar equations by characteristic reduction
	//     // at i+1/2 in x direction
	real_t uf[10], ff[10], pp[10], mm[10], f_flux, _p[Emax][Emax], eigen_lr[Emax], eigen_value, artificial_viscosity;
	for (int n = 0; n < Emax; n++)
	{
		real_t eigen_local_max = _DF(0.0);
		RoeAverageLeft_x(n, eigen_lr, eigen_value, z, _yi, c2, _rho, _u, _v, _w, _H, b1, b3, Gamma0);
		for (int m = -stencil_P; m < stencil_size - stencil_P; m++)
		{
			int _i_1 = i + m, _j_1 = j, _k_1 = k; /* Xmax * Ymax * k + Xmax * j + i + m*/
			int id_local_1 = Xmax * Ymax * (_k_1) + Xmax * (_j_1) + (_i_1);
			eigen_local_max = sycl::max(eigen_local_max, sycl::fabs(eigen_local[Emax * id_local_1 + n])); /* local lax-friedrichs*/
		}
		artificial_viscosity = eigen_local_max;
		for (int m = -3; m <= 4; m++)
		{
			int _i_2 = i + m, _j_2 = j, _k_2 = k; /* Xmax * Ymax * k + Xmax * j + m + i; */
			int id_local = Xmax * Ymax * (_k_2) + Xmax * (_j_2) + (_i_2);
			uf[m + 3] = _DF(0.0);
			ff[m + 3] = _DF(0.0);
			for (int n1 = 0; n1 < Emax; n1++)
			{
				uf[m + 3] = uf[m + 3] + UI[Emax * id_local + n1] * eigen_lr[n1]; /* eigen_l actually */
				ff[m + 3] = ff[m + 3] + Fl[Emax * id_local + n1] * eigen_lr[n1];
			} /*  for local speed*/
			pp[m + 3] = _DF(0.5) * (ff[m + 3] + artificial_viscosity * uf[m + 3]);
			mm[m + 3] = _DF(0.5) * (ff[m + 3] - artificial_viscosity * uf[m + 3]);
		} /* calculate the scalar numerical flux at x direction*/
		f_flux = weno5old_GPU(&pp[3], &mm[3]);
		RoeAverageRight_x(n, eigen_lr, z, _yi, c2, _rho, _u, _v, _w, _H, b1, b3, Gamma0);
		for (int n1 = 0; n1 < Emax; n1++)
		{									   /* get Fp */
			_p[n][n1] = f_flux * eigen_lr[n1]; /* eigen_r actually */
		}
	}
	for (int n = 0; n < Emax; n++)
	{ /* reconstruction the F-flux terms*/
		real_t fluxl = _DF(0.0);
		for (int n1 = 0; n1 < Emax; n1++)
		{
			fluxl += _p[n1][n];
		}
		Fwall[Emax * id_l + n] = fluxl;
	}
}

#if __VENDOR_SUBMIT__
_VENDOR_KERNEL_LB_(__LBMt, 1)
void ReconstructFluxXVendorWrapper(real_t const dl, MeshSize bl, real_t *UI, real_t *Fl, real_t *Fwall, real_t *eigen_local,
								   real_t *p, real_t *rho, real_t *u, real_t *v, real_t *w, real_t *y, real_t *T, real_t *H, real_t *eigen_block)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x + bl.Bwidth_X - 1;
	int j = blockIdx.y * blockDim.y + threadIdx.y + bl.Bwidth_Y;
	int k = blockIdx.z * blockDim.z + threadIdx.z + bl.Bwidth_Z;

	ReconstructFluxX(i, j, k, dl, bl, UI, Fl, Fwall, eigen_local, eigen_lt, eigen_rt, p, rho, u, v, w, y, T, H, eigen_block);
}
#endif

// test Parallelism submission:
#if __VENDOR_SUBMIT__
ReconstructFluxXVendorWrapper<<<temfwx.global_gd(global_ndrange_x), temfwx.local_blk>>>(bl.dx, ms, UI, FluxF, FluxFw,
																						eigen_local_x, p, rho, u, v, w,
																						fdata.y, T, H, eigen_block_x);
#else
q.submit([&](sycl::handler &h) {																						 //
	h.parallel_for(sycl::nd_range<3>(temfwx.global_nd(global_ndrange_x), temfwx.local_nd), [=](sycl::nd_item<3> index) { //
		int i = index.get_global_id(0) + ms.Bwidth_X - 1;
		int j = index.get_global_id(1) + ms.Bwidth_Y;
		int k = index.get_global_id(2) + ms.Bwidth_Z;
		ReconstructFluxX(i, j, k, bl.dx, ms, UI, FluxF, FluxFw, eigen_local_x, p, rho, u, v, w, fdata.y, T, H, eigen_block_x);
	});
});
#endif

The text was updated successfully, but these errors were encountered:

illuhad · 2024-03-12T14:41:29Z

Where can we find the CUDA/HIP version of the code that the SYCL version was benchmarked against?
How and where exactly are you measuring the runtime?
For best performance, please consider using --acpp-targets=generic, not the older multipass flows.

Jiuxiaoyunhai · 2024-03-13T03:26:08Z

for the first: https://github.com/XFluids/XFluids/blob/master/src/solver_Reconstruction/FDM_Method/Reconstruction_kernels.hpp, line 8,

a sycl kernel function named ReconstructFluxX is tested, and for CUDA/HIP code, we package the ReconstructFluxX kernel into a CUDA/HIP experssion kernel function named ReconstructFluxXVendorWrapper, and we add the __launch_bounds() attribute, too, it's a marco function named VENDOR_KERNEL_LB(__LBMt, 1), and __LBMt is set to 256 for CUDA and HIP. A cmake option named VENDOR_SUBMIT of https://github.com/XFluids/XFluids/blob/master/CMakeLists.txt to control either SYCL parallelism(set VENDOR_SUBMIT OFF) or CUDA/HIP(set VENDOR_SUBMIT ON) parallelism. Additionally, we tested the Kernel to find an optimization local block shape(work-group size in SYCL) for kernel's executing, it's processed by XFLUIDS automatically.

for the seond:

well, a little complex to run XFLUIDS until a binary file thrown, then rerun XFLUIDS, the binary file is read for best performance, for short, you compile XFLUIDS(paltform and arch is given in line 7,8 of https://github.com/XFluids/XFluids/blob/master/CMakeLists.txt) in build, assuming build as current work dir

./XFLUIDS -dev=1,1,0 -run=2048,2048,0,50 , this is to test block shape and throw a binary file named: 2d-riemann-shocks-pp(off)-vis(off)-rode(off)AdaptiveRange(2048+4)x(2048+4)x(0+0) to /build/output/ , because of the multipass used, option "-dev" is used to select device found by acpp-info, the three number means: devices' number only given > 1 if mpirun is executed, the second number is the platform to be selected (CPU or GPU), the three number is device id, this function is used to select device and construct a queue:
sycl::queue q = sycl::queue(sycl::platform::get_platforms()[pid].get_devices()[did]);
value of the second number of "-dev" option is given to pid, and the 3rd given to did
./XFLUIDS -dev=1,1,0 -run=2048,2048,0,20, this is to measure the runtime, XFLUIDS automatically read the binary file mentioned before, we get the runtime of "SyncReconstructWallFluxX" form bash print, we use the time as the indicator

and timer is implemented by std::choron, see https://github.com/XFluids/XFluids/blob/master/src/solver_Reconstruction/FDM_Method/ConVenction_block.hpp line 219, a timer spot is created, and then we submit the Kernel parallelism and wait to sync, once the wait is done, then get the duration time between the time previous spot and current.

for the second:

We had never tried before, we will try later..

Jiuxiaoyunhai added the discussion General discussion about something label Mar 12, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

More GPU Performance Optimization #1399

More GPU Performance Optimization #1399

Jiuxiaoyunhai commented Mar 12, 2024 •

edited by fodinabor

illuhad commented Mar 12, 2024

Jiuxiaoyunhai commented Mar 13, 2024

More GPU Performance Optimization #1399

More GPU Performance Optimization #1399

Comments

Jiuxiaoyunhai commented Mar 12, 2024 • edited by fodinabor

A list of GPUs has been tested by the following Kernel, it's part of our opensource CFD framwork XFLUIDS:

llvm-16.0.6 is implemented on A100, amd Rocm-5.4.1 on RX6800XT, without competitive performance over CUDA/HIP model, we are now using multi-pass compile system

the Performance listed here:

Kernel Code listed here:

illuhad commented Mar 12, 2024

Jiuxiaoyunhai commented Mar 13, 2024

for the first: https://github.com/XFluids/XFluids/blob/master/src/solver_Reconstruction/FDM_Method/Reconstruction_kernels.hpp, line 8,

for the seond:

for the second:

Jiuxiaoyunhai commented Mar 12, 2024 •

edited by fodinabor