Skip to content

More GPU Performance Optimization  #1399

Open
@JinlongLi2001

Description

@JinlongLi2001

A list of GPUs has been tested by the following Kernel, it's part of our opensource CFD framwork XFLUIDS:

llvm-16.0.6 is implemented on A100, amd Rocm-5.4.1 on RX6800XT, without competitive performance over CUDA/HIP model, we are now using multi-pass compile system

the Performance listed here:

图片1

Kernel Code listed here:

#define Emax 5
#define NUM_COP 0
#define _DF(a) a
using real_t = double;

#include <sycl.hpp>

// // AdaptiveCpp HIP Target
#ifdef __HIPSYCL_ENABLE_HIP_TARGET__
#define __LBMt 256 // <=256 for HIP
using vendorError_t = hipError_t;
using vendorDeviceProp = hipDeviceProp_t;
using vendorFuncAttributes = hipFuncAttributes;

#define _VENDOR_KERNEL_ __global__ __launch_bounds__(256, 1)

#define vendorSuccess hipSuccess;
#define vendorSetDevice(A) hipSetDevice(A)
#define vendorGetLastError() hipGetLastError()
#define vendorDeviceSynchronize() hipDeviceSynchronize()
#define vendorFuncGetAttributes(A, B) hipFuncGetAttributes(A, B)
#define vendorGetDeviceProperties(A, B) hipGetDeviceProperties(A, B)

#define CheckGPUErrors(call)                                                             \
	{                                                                                    \
		hipError_t hipStatus = call;                                                     \
		if (hipSuccess != hipStatus)                                                     \
		{                                                                                \
			fprintf(stderr,                                                              \
					"ERROR: CUDA RT call \"%s\" in line %d of file %s failed "           \
					"with "                                                              \
					"%s (%d).\n",                                                        \
					#call, __LINE__, __FILE__, hipGetErrorString(hipStatus), hipStatus); \
			exit(EXIT_FAILURE);                                                          \
		}                                                                                \
	}                                                                                    \
	while (0)

// // OpenSYCL CUDA Target
#elif defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
#define __LBMt 256 // <=256 for HIP
using vendorError_t = cudaError_t;
using vendorDeviceProp = cudaDeviceProp;
using vendorFuncAttributes = cudaFuncAttributes;

#define _VENDOR_KERNEL_ __global__

#define vendorSuccess cudaSuccess;
#define vendorSetDevice(A) cudaSetDevice(A)
#define vendorGetLastError() cudaGetLastError()
#define vendorDeviceSynchronize() cudaDeviceSynchronize()
#define vendorFuncGetAttributes(A, B) cudaFuncGetAttributes(A, B)
#define vendorGetDeviceProperties(A, B) cudaGetDeviceProperties(A, B)

#define CheckGPUErrors(call)                                                                \
	{                                                                                       \
		cudaError_t cudaStatus = call;                                                      \
		if (cudaSuccess != cudaStatus)                                                      \
		{                                                                                   \
			fprintf(stderr,                                                                 \
					"ERROR: CUDA RT call \"%s\" in line %d of file %s failed "              \
					"with "                                                                 \
					"%s (%d).\n",                                                           \
					#call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
			exit(EXIT_FAILURE);                                                             \
		}                                                                                   \
	}                                                                                       \
	while (0)

#else // defined(__HIPSYCL_ENABLE_HIP_TARGET__) || (__HIPSYCL_ENABLE_CUDA_TARGET__)

#define SYCL_KERNEL __host__ __device__
#define SYCL_DEVICE __host__ __device__
#define _VENDOR_KERNEL_LB_(A, B) __global__ __launch_bounds__(A, B)

#endif // end Marco

typedef struct
{
	int Xmax, Ymax, Zmax;
	int X_inner, Y_inner, Z_inner;
	int Bwidth_X, Bwidth_Y, Bwidth_Z;
} MeshSize;

SYCL_DEVICE inline real_t weno5old_BODY(const real_t v1, const real_t v2, const real_t v3, const real_t v4, const real_t v5)
{
	real_t a1, a2, a3;
	real_t dtwo = _DF(2.0), dtre = _DF(3.0);

	a1 = v1 - dtwo * v2 + v3;
	real_t s1 = _DF(13.0) * a1 * a1;
	a1 = v1 - _DF(4.0) * v2 + dtre * v3;
	s1 += dtre * a1 * a1;
	a1 = v2 - dtwo * v3 + v4;
	real_t s2 = _DF(13.0) * a1 * a1;
	a1 = v2 - v4;
	s2 += dtre * a1 * a1;
	a1 = v3 - dtwo * v4 + v5;
	real_t s3 = _DF(13.0) * a1 * a1;
	a1 = dtre * v3 - _DF(4.0) * v4 + v5;
	s3 += dtre * a1 * a1;

	real_t tol = _DF(1.0E-6);
	s1 += tol, s2 += tol, s3 += tol;

	// // for double precision
	a1 = _DF(0.1) * s2 * s2 * s3 * s3;
	a2 = _DF(0.2) * s1 * s1 * s3 * s3;
	a3 = _DF(0.3) * s1 * s1 * s2 * s2;

	real_t tw1 = _DF(1.0) / (a1 + a2 + a3);
	a1 = a1 * tw1, a2 = a2 * tw1, a3 = a3 * tw1;

	s1 = a1 * (dtwo * v1 - _DF(7.0) * v2 + _DF(11.0) * v3);
	s2 = a2 * (-v2 + _DF(5.0) * v3 + dtwo * v4);
	s3 = a3 * (dtwo * v3 + _DF(5.0) * v4 - v5);

	return (s1 + s2 + s3);
}

SYCL_DEVICE inline real_t weno5old_GPU(real_t *f, real_t *m)
{
	int k;
	real_t v1, v2, v3, v4, v5, temf, temm;
	// ff
	k = 0;
	v1 = *(f + k - 2);
	v2 = *(f + k - 1);
	v3 = *(f + k);
	v4 = *(f + k + 1);
	v5 = *(f + k + 2);

	temf = weno5old_BODY(v1, v2, v3, v4, v5); //* sum;
	// mm
	k = 1;
	v1 = *(m + k + 2);
	v2 = *(m + k + 1);
	v3 = *(m + k);
	v4 = *(m + k - 1);
	v5 = *(m + k - 2);

	temm = weno5old_BODY(v1, v2, v3, v4, v5); //* sum;

	return (temf + temm) * _six;
}

SYCL_DEVICE inline void RoeAverageLeft_x(int const n, real_t *eigen_l, real_t &eigen_value, real_t *z, const real_t *yi, real_t const c2,
										 real_t const _rho, real_t const _u, real_t const _v, real_t const _w, real_t const _H,
										 real_t const b1, real_t const b3, real_t Gamma)
{
	real_t q2 = _u * _u + _v * _v + _w * _w;
	real_t _c = sycl::sqrt(c2);
	real_t b2 = _DF(1.0) + b1 * q2 - b1 * _H;
	real_t _c1 = _DF(1.0) / _c;
	real_t _u_c = _u * _c1;

	if (0 == n)
	{
		eigen_l[0] = _DF(0.5) * (b2 + _u_c + b3);
		eigen_l[1] = -_DF(0.5) * (b1 * _u + _c1);
		eigen_l[2] = -_DF(0.5) * (b1 * _v);
		eigen_l[3] = -_DF(0.5) * (b1 * _w);
		eigen_l[4] = _DF(0.5) * b1;
		eigen_value = sycl::fabs(_u - _c);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = -_DF(0.5) * b1 * z[m];
	}
	else if (1 == n)
	{
		eigen_l[0] = (_DF(1.0) - b2 - b3) / b1;
		eigen_l[1] = _u;
		eigen_l[2] = _v;
		eigen_l[3] = _w;
		eigen_l[4] = -_DF(1.0);
		eigen_value = sycl::fabs(_u);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = z[m];
	}
	else if (2 == n)
	{
		eigen_l[0] = _v;
		eigen_l[1] = _DF(0.0);
		eigen_l[2] = -_DF(1.0);
		eigen_l[3] = _DF(0.0);
		eigen_l[4] = _DF(0.0);
		eigen_value = sycl::fabs(_u);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (3 == n)
	{
		eigen_l[0] = -_w;
		eigen_l[1] = _DF(0.0);
		eigen_l[2] = _DF(0.0);
		eigen_l[3] = _DF(1.0);
		eigen_l[4] = _DF(0.0);
		eigen_value = sycl::fabs(_u);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (Emax - 1 == n)
	{
		eigen_l[0] = _DF(0.5) * (b2 - _u_c + b3);
		eigen_l[1] = _DF(0.5) * (-b1 * _u + _c1);
		eigen_l[2] = _DF(0.5) * (-b1 * _v);
		eigen_l[3] = _DF(0.5) * (-b1 * _w);
		eigen_l[4] = _DF(0.5) * b1;
		eigen_value = sycl::fabs(_u + _c);
		for (int m = 0; m < NUM_COP; m++)
			eigen_l[m + Emax - NUM_COP] = -_DF(0.5) * b1 * z[m];
	}
}

SYCL_DEVICE inline void RoeAverageRight_x(int const n, real_t *eigen_r, real_t *z, const real_t *yi, real_t const c2,
										  real_t const _rho, real_t const _u, real_t const _v, real_t const _w, real_t const _H,
										  real_t const b1, real_t const b3, real_t Gamma)
{
	real_t q2 = _u * _u + _v * _v + _w * _w;
	real_t _c = sycl::sqrt(c2);
	real_t b2 = _DF(1.0) + b1 * q2 - b1 * _H;
	real_t _c1 = _DF(1.0) / _c;
	real_t _u_c = _u * _c1;

	if (0 == n)
	{
		eigen_r[0] = _DF(1.0);
		eigen_r[1] = _u - _c;
		eigen_r[2] = _v;
		eigen_r[3] = _w;
		eigen_r[4] = _H - _u * _c;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = yi[m];
	}
	else if (1 == n)
	{
		eigen_r[0] = b1;
		eigen_r[1] = _u * b1;
		eigen_r[2] = _v * b1;
		eigen_r[3] = _w * b1;
		eigen_r[4] = _H * b1 - _DF(1.0);
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = b1 * yi[m];
	}
	else if (2 == n)
	{
		eigen_r[0] = _DF(0.0);
		eigen_r[1] = _DF(0.0);
		eigen_r[2] = -_DF(1.0);
		eigen_r[3] = _DF(0.0);
		eigen_r[4] = -_v;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (3 == n)
	{
		eigen_r[0] = _DF(0.0);
		eigen_r[1] = _DF(0.0);
		eigen_r[2] = _DF(0.0);
		eigen_r[3] = _DF(1.0);
		eigen_r[4] = _w;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = _DF(0.0);
	}
	else if (Emax - 1 == n)
	{
		eigen_r[0] = _DF(1.0);
		eigen_r[1] = _u + _c;
		eigen_r[2] = _v;
		eigen_r[3] = _w;
		eigen_r[4] = _H + _u * _c;
		for (int m = 0; m < NUM_COP; m++)
			eigen_r[m + Emax - NUM_COP] = yi[m];
	}
}

extern SYCL_KERNEL void ReconstructFluxX(int i, int j, int k, real_t const dl, MeshSize bl, real_t *UI, real_t *Fl, real_t *Fwall, real_t *eigen_local,
										 real_t *p, real_t *rho, real_t *u, real_t *v, real_t *w, real_t *y, real_t *T, real_t *H, real_t *eigen_block)
{
	int Xmax = bl.Xmax;
	int Ymax = bl.Ymax;
	int Zmax = bl.Zmax;
	int X_inner = bl.X_inner;
	int Y_inner = bl.Y_inner;
	int Z_inner = bl.Z_inner;
	int Bwidth_X = bl.Bwidth_X;
	int Bwidth_Y = bl.Bwidth_Y;
	int Bwidth_Z = bl.Bwidth_Z;
	if (i >= X_inner + Bwidth_X)
		return;
	if (j >= Y_inner + Bwidth_Y)
		return;
	if (k >= Z_inner + Bwidth_Z)
		return;

	size_t id_l = Xmax * Ymax * k + Xmax * j + i;
	size_t id_r = Xmax * Ymax * k + Xmax * j + i + 1;

	// preparing some interval value for roe average
	real_t D = sycl::sqrt(rho[id_r] / rho[id_l]);
	real_t D1 = _DF(1.0) / (D + _DF(1.0));
	real_t _u = (u[id_l] + D * u[id_r]) * D1;
	real_t _v = (v[id_l] + D * v[id_r]) * D1;
	real_t _w = (w[id_l] + D * w[id_r]) * D1;
	real_t _H = (H[id_l] + D * H[id_r]) * D1;
	real_t _P = (p[id_l] + D * p[id_r]) * D1;
	real_t _rho = sycl::sqrt(rho[id_r] * rho[id_l]);
	real_t _yi[NUM_SPECIES] = {_DF(1.0)}, b3 = _DF(0.0), z[] = {_DF(0.0)};
	real_t Gamma0 = 1.4;
	real_t c2 = Gamma0 * _P / _rho; /*(_H - _DF(0.5) * (_u * _u + _v * _v + _w * _w));*/
	real_t b1 = (Gamma0 - _DF(1.0)) / c2;
	//     // construct the right value & the left value scalar equations by characteristic reduction
	//     // at i+1/2 in x direction
	real_t uf[10], ff[10], pp[10], mm[10], f_flux, _p[Emax][Emax], eigen_lr[Emax], eigen_value, artificial_viscosity;
	for (int n = 0; n < Emax; n++)
	{
		real_t eigen_local_max = _DF(0.0);
		RoeAverageLeft_x(n, eigen_lr, eigen_value, z, _yi, c2, _rho, _u, _v, _w, _H, b1, b3, Gamma0);
		for (int m = -stencil_P; m < stencil_size - stencil_P; m++)
		{
			int _i_1 = i + m, _j_1 = j, _k_1 = k; /* Xmax * Ymax * k + Xmax * j + i + m*/
			int id_local_1 = Xmax * Ymax * (_k_1) + Xmax * (_j_1) + (_i_1);
			eigen_local_max = sycl::max(eigen_local_max, sycl::fabs(eigen_local[Emax * id_local_1 + n])); /* local lax-friedrichs*/
		}
		artificial_viscosity = eigen_local_max;
		for (int m = -3; m <= 4; m++)
		{
			int _i_2 = i + m, _j_2 = j, _k_2 = k; /* Xmax * Ymax * k + Xmax * j + m + i; */
			int id_local = Xmax * Ymax * (_k_2) + Xmax * (_j_2) + (_i_2);
			uf[m + 3] = _DF(0.0);
			ff[m + 3] = _DF(0.0);
			for (int n1 = 0; n1 < Emax; n1++)
			{
				uf[m + 3] = uf[m + 3] + UI[Emax * id_local + n1] * eigen_lr[n1]; /* eigen_l actually */
				ff[m + 3] = ff[m + 3] + Fl[Emax * id_local + n1] * eigen_lr[n1];
			} /*  for local speed*/
			pp[m + 3] = _DF(0.5) * (ff[m + 3] + artificial_viscosity * uf[m + 3]);
			mm[m + 3] = _DF(0.5) * (ff[m + 3] - artificial_viscosity * uf[m + 3]);
		} /* calculate the scalar numerical flux at x direction*/
		f_flux = weno5old_GPU(&pp[3], &mm[3]);
		RoeAverageRight_x(n, eigen_lr, z, _yi, c2, _rho, _u, _v, _w, _H, b1, b3, Gamma0);
		for (int n1 = 0; n1 < Emax; n1++)
		{									   /* get Fp */
			_p[n][n1] = f_flux * eigen_lr[n1]; /* eigen_r actually */
		}
	}
	for (int n = 0; n < Emax; n++)
	{ /* reconstruction the F-flux terms*/
		real_t fluxl = _DF(0.0);
		for (int n1 = 0; n1 < Emax; n1++)
		{
			fluxl += _p[n1][n];
		}
		Fwall[Emax * id_l + n] = fluxl;
	}
}

#if __VENDOR_SUBMIT__
_VENDOR_KERNEL_LB_(__LBMt, 1)
void ReconstructFluxXVendorWrapper(real_t const dl, MeshSize bl, real_t *UI, real_t *Fl, real_t *Fwall, real_t *eigen_local,
								   real_t *p, real_t *rho, real_t *u, real_t *v, real_t *w, real_t *y, real_t *T, real_t *H, real_t *eigen_block)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x + bl.Bwidth_X - 1;
	int j = blockIdx.y * blockDim.y + threadIdx.y + bl.Bwidth_Y;
	int k = blockIdx.z * blockDim.z + threadIdx.z + bl.Bwidth_Z;

	ReconstructFluxX(i, j, k, dl, bl, UI, Fl, Fwall, eigen_local, eigen_lt, eigen_rt, p, rho, u, v, w, y, T, H, eigen_block);
}
#endif

// test Parallelism submission:
#if __VENDOR_SUBMIT__
ReconstructFluxXVendorWrapper<<<temfwx.global_gd(global_ndrange_x), temfwx.local_blk>>>(bl.dx, ms, UI, FluxF, FluxFw,
																						eigen_local_x, p, rho, u, v, w,
																						fdata.y, T, H, eigen_block_x);
#else
q.submit([&](sycl::handler &h) {																						 //
	h.parallel_for(sycl::nd_range<3>(temfwx.global_nd(global_ndrange_x), temfwx.local_nd), [=](sycl::nd_item<3> index) { //
		int i = index.get_global_id(0) + ms.Bwidth_X - 1;
		int j = index.get_global_id(1) + ms.Bwidth_Y;
		int k = index.get_global_id(2) + ms.Bwidth_Z;
		ReconstructFluxX(i, j, k, bl.dx, ms, UI, FluxF, FluxFw, eigen_local_x, p, rho, u, v, w, fdata.y, T, H, eigen_block_x);
	});
});
#endif

Metadata

Metadata

Assignees

No one assigned

    Labels

    discussionGeneral discussion about something

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions