Open
Description
A list of GPUs has been tested by the following Kernel, it's part of our opensource CFD framwork XFLUIDS:
llvm-16.0.6 is implemented on A100, amd Rocm-5.4.1 on RX6800XT, without competitive performance over CUDA/HIP model, we are now using multi-pass compile system
the Performance listed here:
Kernel Code listed here:
#define Emax 5
#define NUM_COP 0
#define _DF(a) a
using real_t = double;
#include <sycl.hpp>
// // AdaptiveCpp HIP Target
#ifdef __HIPSYCL_ENABLE_HIP_TARGET__
#define __LBMt 256 // <=256 for HIP
using vendorError_t = hipError_t;
using vendorDeviceProp = hipDeviceProp_t;
using vendorFuncAttributes = hipFuncAttributes;
#define _VENDOR_KERNEL_ __global__ __launch_bounds__(256, 1)
#define vendorSuccess hipSuccess;
#define vendorSetDevice(A) hipSetDevice(A)
#define vendorGetLastError() hipGetLastError()
#define vendorDeviceSynchronize() hipDeviceSynchronize()
#define vendorFuncGetAttributes(A, B) hipFuncGetAttributes(A, B)
#define vendorGetDeviceProperties(A, B) hipGetDeviceProperties(A, B)
#define CheckGPUErrors(call) \
{ \
hipError_t hipStatus = call; \
if (hipSuccess != hipStatus) \
{ \
fprintf(stderr, \
"ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \
"with " \
"%s (%d).\n", \
#call, __LINE__, __FILE__, hipGetErrorString(hipStatus), hipStatus); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
// // OpenSYCL CUDA Target
#elif defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
#define __LBMt 256 // <=256 for HIP
using vendorError_t = cudaError_t;
using vendorDeviceProp = cudaDeviceProp;
using vendorFuncAttributes = cudaFuncAttributes;
#define _VENDOR_KERNEL_ __global__
#define vendorSuccess cudaSuccess;
#define vendorSetDevice(A) cudaSetDevice(A)
#define vendorGetLastError() cudaGetLastError()
#define vendorDeviceSynchronize() cudaDeviceSynchronize()
#define vendorFuncGetAttributes(A, B) cudaFuncGetAttributes(A, B)
#define vendorGetDeviceProperties(A, B) cudaGetDeviceProperties(A, B)
#define CheckGPUErrors(call) \
{ \
cudaError_t cudaStatus = call; \
if (cudaSuccess != cudaStatus) \
{ \
fprintf(stderr, \
"ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \
"with " \
"%s (%d).\n", \
#call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
#else // defined(__HIPSYCL_ENABLE_HIP_TARGET__) || (__HIPSYCL_ENABLE_CUDA_TARGET__)
#define SYCL_KERNEL __host__ __device__
#define SYCL_DEVICE __host__ __device__
#define _VENDOR_KERNEL_LB_(A, B) __global__ __launch_bounds__(A, B)
#endif // end Marco
typedef struct
{
int Xmax, Ymax, Zmax;
int X_inner, Y_inner, Z_inner;
int Bwidth_X, Bwidth_Y, Bwidth_Z;
} MeshSize;
SYCL_DEVICE inline real_t weno5old_BODY(const real_t v1, const real_t v2, const real_t v3, const real_t v4, const real_t v5)
{
real_t a1, a2, a3;
real_t dtwo = _DF(2.0), dtre = _DF(3.0);
a1 = v1 - dtwo * v2 + v3;
real_t s1 = _DF(13.0) * a1 * a1;
a1 = v1 - _DF(4.0) * v2 + dtre * v3;
s1 += dtre * a1 * a1;
a1 = v2 - dtwo * v3 + v4;
real_t s2 = _DF(13.0) * a1 * a1;
a1 = v2 - v4;
s2 += dtre * a1 * a1;
a1 = v3 - dtwo * v4 + v5;
real_t s3 = _DF(13.0) * a1 * a1;
a1 = dtre * v3 - _DF(4.0) * v4 + v5;
s3 += dtre * a1 * a1;
real_t tol = _DF(1.0E-6);
s1 += tol, s2 += tol, s3 += tol;
// // for double precision
a1 = _DF(0.1) * s2 * s2 * s3 * s3;
a2 = _DF(0.2) * s1 * s1 * s3 * s3;
a3 = _DF(0.3) * s1 * s1 * s2 * s2;
real_t tw1 = _DF(1.0) / (a1 + a2 + a3);
a1 = a1 * tw1, a2 = a2 * tw1, a3 = a3 * tw1;
s1 = a1 * (dtwo * v1 - _DF(7.0) * v2 + _DF(11.0) * v3);
s2 = a2 * (-v2 + _DF(5.0) * v3 + dtwo * v4);
s3 = a3 * (dtwo * v3 + _DF(5.0) * v4 - v5);
return (s1 + s2 + s3);
}
SYCL_DEVICE inline real_t weno5old_GPU(real_t *f, real_t *m)
{
int k;
real_t v1, v2, v3, v4, v5, temf, temm;
// ff
k = 0;
v1 = *(f + k - 2);
v2 = *(f + k - 1);
v3 = *(f + k);
v4 = *(f + k + 1);
v5 = *(f + k + 2);
temf = weno5old_BODY(v1, v2, v3, v4, v5); //* sum;
// mm
k = 1;
v1 = *(m + k + 2);
v2 = *(m + k + 1);
v3 = *(m + k);
v4 = *(m + k - 1);
v5 = *(m + k - 2);
temm = weno5old_BODY(v1, v2, v3, v4, v5); //* sum;
return (temf + temm) * _six;
}
SYCL_DEVICE inline void RoeAverageLeft_x(int const n, real_t *eigen_l, real_t &eigen_value, real_t *z, const real_t *yi, real_t const c2,
real_t const _rho, real_t const _u, real_t const _v, real_t const _w, real_t const _H,
real_t const b1, real_t const b3, real_t Gamma)
{
real_t q2 = _u * _u + _v * _v + _w * _w;
real_t _c = sycl::sqrt(c2);
real_t b2 = _DF(1.0) + b1 * q2 - b1 * _H;
real_t _c1 = _DF(1.0) / _c;
real_t _u_c = _u * _c1;
if (0 == n)
{
eigen_l[0] = _DF(0.5) * (b2 + _u_c + b3);
eigen_l[1] = -_DF(0.5) * (b1 * _u + _c1);
eigen_l[2] = -_DF(0.5) * (b1 * _v);
eigen_l[3] = -_DF(0.5) * (b1 * _w);
eigen_l[4] = _DF(0.5) * b1;
eigen_value = sycl::fabs(_u - _c);
for (int m = 0; m < NUM_COP; m++)
eigen_l[m + Emax - NUM_COP] = -_DF(0.5) * b1 * z[m];
}
else if (1 == n)
{
eigen_l[0] = (_DF(1.0) - b2 - b3) / b1;
eigen_l[1] = _u;
eigen_l[2] = _v;
eigen_l[3] = _w;
eigen_l[4] = -_DF(1.0);
eigen_value = sycl::fabs(_u);
for (int m = 0; m < NUM_COP; m++)
eigen_l[m + Emax - NUM_COP] = z[m];
}
else if (2 == n)
{
eigen_l[0] = _v;
eigen_l[1] = _DF(0.0);
eigen_l[2] = -_DF(1.0);
eigen_l[3] = _DF(0.0);
eigen_l[4] = _DF(0.0);
eigen_value = sycl::fabs(_u);
for (int m = 0; m < NUM_COP; m++)
eigen_l[m + Emax - NUM_COP] = _DF(0.0);
}
else if (3 == n)
{
eigen_l[0] = -_w;
eigen_l[1] = _DF(0.0);
eigen_l[2] = _DF(0.0);
eigen_l[3] = _DF(1.0);
eigen_l[4] = _DF(0.0);
eigen_value = sycl::fabs(_u);
for (int m = 0; m < NUM_COP; m++)
eigen_l[m + Emax - NUM_COP] = _DF(0.0);
}
else if (Emax - 1 == n)
{
eigen_l[0] = _DF(0.5) * (b2 - _u_c + b3);
eigen_l[1] = _DF(0.5) * (-b1 * _u + _c1);
eigen_l[2] = _DF(0.5) * (-b1 * _v);
eigen_l[3] = _DF(0.5) * (-b1 * _w);
eigen_l[4] = _DF(0.5) * b1;
eigen_value = sycl::fabs(_u + _c);
for (int m = 0; m < NUM_COP; m++)
eigen_l[m + Emax - NUM_COP] = -_DF(0.5) * b1 * z[m];
}
}
SYCL_DEVICE inline void RoeAverageRight_x(int const n, real_t *eigen_r, real_t *z, const real_t *yi, real_t const c2,
real_t const _rho, real_t const _u, real_t const _v, real_t const _w, real_t const _H,
real_t const b1, real_t const b3, real_t Gamma)
{
real_t q2 = _u * _u + _v * _v + _w * _w;
real_t _c = sycl::sqrt(c2);
real_t b2 = _DF(1.0) + b1 * q2 - b1 * _H;
real_t _c1 = _DF(1.0) / _c;
real_t _u_c = _u * _c1;
if (0 == n)
{
eigen_r[0] = _DF(1.0);
eigen_r[1] = _u - _c;
eigen_r[2] = _v;
eigen_r[3] = _w;
eigen_r[4] = _H - _u * _c;
for (int m = 0; m < NUM_COP; m++)
eigen_r[m + Emax - NUM_COP] = yi[m];
}
else if (1 == n)
{
eigen_r[0] = b1;
eigen_r[1] = _u * b1;
eigen_r[2] = _v * b1;
eigen_r[3] = _w * b1;
eigen_r[4] = _H * b1 - _DF(1.0);
for (int m = 0; m < NUM_COP; m++)
eigen_r[m + Emax - NUM_COP] = b1 * yi[m];
}
else if (2 == n)
{
eigen_r[0] = _DF(0.0);
eigen_r[1] = _DF(0.0);
eigen_r[2] = -_DF(1.0);
eigen_r[3] = _DF(0.0);
eigen_r[4] = -_v;
for (int m = 0; m < NUM_COP; m++)
eigen_r[m + Emax - NUM_COP] = _DF(0.0);
}
else if (3 == n)
{
eigen_r[0] = _DF(0.0);
eigen_r[1] = _DF(0.0);
eigen_r[2] = _DF(0.0);
eigen_r[3] = _DF(1.0);
eigen_r[4] = _w;
for (int m = 0; m < NUM_COP; m++)
eigen_r[m + Emax - NUM_COP] = _DF(0.0);
}
else if (Emax - 1 == n)
{
eigen_r[0] = _DF(1.0);
eigen_r[1] = _u + _c;
eigen_r[2] = _v;
eigen_r[3] = _w;
eigen_r[4] = _H + _u * _c;
for (int m = 0; m < NUM_COP; m++)
eigen_r[m + Emax - NUM_COP] = yi[m];
}
}
extern SYCL_KERNEL void ReconstructFluxX(int i, int j, int k, real_t const dl, MeshSize bl, real_t *UI, real_t *Fl, real_t *Fwall, real_t *eigen_local,
real_t *p, real_t *rho, real_t *u, real_t *v, real_t *w, real_t *y, real_t *T, real_t *H, real_t *eigen_block)
{
int Xmax = bl.Xmax;
int Ymax = bl.Ymax;
int Zmax = bl.Zmax;
int X_inner = bl.X_inner;
int Y_inner = bl.Y_inner;
int Z_inner = bl.Z_inner;
int Bwidth_X = bl.Bwidth_X;
int Bwidth_Y = bl.Bwidth_Y;
int Bwidth_Z = bl.Bwidth_Z;
if (i >= X_inner + Bwidth_X)
return;
if (j >= Y_inner + Bwidth_Y)
return;
if (k >= Z_inner + Bwidth_Z)
return;
size_t id_l = Xmax * Ymax * k + Xmax * j + i;
size_t id_r = Xmax * Ymax * k + Xmax * j + i + 1;
// preparing some interval value for roe average
real_t D = sycl::sqrt(rho[id_r] / rho[id_l]);
real_t D1 = _DF(1.0) / (D + _DF(1.0));
real_t _u = (u[id_l] + D * u[id_r]) * D1;
real_t _v = (v[id_l] + D * v[id_r]) * D1;
real_t _w = (w[id_l] + D * w[id_r]) * D1;
real_t _H = (H[id_l] + D * H[id_r]) * D1;
real_t _P = (p[id_l] + D * p[id_r]) * D1;
real_t _rho = sycl::sqrt(rho[id_r] * rho[id_l]);
real_t _yi[NUM_SPECIES] = {_DF(1.0)}, b3 = _DF(0.0), z[] = {_DF(0.0)};
real_t Gamma0 = 1.4;
real_t c2 = Gamma0 * _P / _rho; /*(_H - _DF(0.5) * (_u * _u + _v * _v + _w * _w));*/
real_t b1 = (Gamma0 - _DF(1.0)) / c2;
// // construct the right value & the left value scalar equations by characteristic reduction
// // at i+1/2 in x direction
real_t uf[10], ff[10], pp[10], mm[10], f_flux, _p[Emax][Emax], eigen_lr[Emax], eigen_value, artificial_viscosity;
for (int n = 0; n < Emax; n++)
{
real_t eigen_local_max = _DF(0.0);
RoeAverageLeft_x(n, eigen_lr, eigen_value, z, _yi, c2, _rho, _u, _v, _w, _H, b1, b3, Gamma0);
for (int m = -stencil_P; m < stencil_size - stencil_P; m++)
{
int _i_1 = i + m, _j_1 = j, _k_1 = k; /* Xmax * Ymax * k + Xmax * j + i + m*/
int id_local_1 = Xmax * Ymax * (_k_1) + Xmax * (_j_1) + (_i_1);
eigen_local_max = sycl::max(eigen_local_max, sycl::fabs(eigen_local[Emax * id_local_1 + n])); /* local lax-friedrichs*/
}
artificial_viscosity = eigen_local_max;
for (int m = -3; m <= 4; m++)
{
int _i_2 = i + m, _j_2 = j, _k_2 = k; /* Xmax * Ymax * k + Xmax * j + m + i; */
int id_local = Xmax * Ymax * (_k_2) + Xmax * (_j_2) + (_i_2);
uf[m + 3] = _DF(0.0);
ff[m + 3] = _DF(0.0);
for (int n1 = 0; n1 < Emax; n1++)
{
uf[m + 3] = uf[m + 3] + UI[Emax * id_local + n1] * eigen_lr[n1]; /* eigen_l actually */
ff[m + 3] = ff[m + 3] + Fl[Emax * id_local + n1] * eigen_lr[n1];
} /* for local speed*/
pp[m + 3] = _DF(0.5) * (ff[m + 3] + artificial_viscosity * uf[m + 3]);
mm[m + 3] = _DF(0.5) * (ff[m + 3] - artificial_viscosity * uf[m + 3]);
} /* calculate the scalar numerical flux at x direction*/
f_flux = weno5old_GPU(&pp[3], &mm[3]);
RoeAverageRight_x(n, eigen_lr, z, _yi, c2, _rho, _u, _v, _w, _H, b1, b3, Gamma0);
for (int n1 = 0; n1 < Emax; n1++)
{ /* get Fp */
_p[n][n1] = f_flux * eigen_lr[n1]; /* eigen_r actually */
}
}
for (int n = 0; n < Emax; n++)
{ /* reconstruction the F-flux terms*/
real_t fluxl = _DF(0.0);
for (int n1 = 0; n1 < Emax; n1++)
{
fluxl += _p[n1][n];
}
Fwall[Emax * id_l + n] = fluxl;
}
}
#if __VENDOR_SUBMIT__
_VENDOR_KERNEL_LB_(__LBMt, 1)
void ReconstructFluxXVendorWrapper(real_t const dl, MeshSize bl, real_t *UI, real_t *Fl, real_t *Fwall, real_t *eigen_local,
real_t *p, real_t *rho, real_t *u, real_t *v, real_t *w, real_t *y, real_t *T, real_t *H, real_t *eigen_block)
{
int i = blockIdx.x * blockDim.x + threadIdx.x + bl.Bwidth_X - 1;
int j = blockIdx.y * blockDim.y + threadIdx.y + bl.Bwidth_Y;
int k = blockIdx.z * blockDim.z + threadIdx.z + bl.Bwidth_Z;
ReconstructFluxX(i, j, k, dl, bl, UI, Fl, Fwall, eigen_local, eigen_lt, eigen_rt, p, rho, u, v, w, y, T, H, eigen_block);
}
#endif
// test Parallelism submission:
#if __VENDOR_SUBMIT__
ReconstructFluxXVendorWrapper<<<temfwx.global_gd(global_ndrange_x), temfwx.local_blk>>>(bl.dx, ms, UI, FluxF, FluxFw,
eigen_local_x, p, rho, u, v, w,
fdata.y, T, H, eigen_block_x);
#else
q.submit([&](sycl::handler &h) { //
h.parallel_for(sycl::nd_range<3>(temfwx.global_nd(global_ndrange_x), temfwx.local_nd), [=](sycl::nd_item<3> index) { //
int i = index.get_global_id(0) + ms.Bwidth_X - 1;
int j = index.get_global_id(1) + ms.Bwidth_Y;
int k = index.get_global_id(2) + ms.Bwidth_Z;
ReconstructFluxX(i, j, k, bl.dx, ms, UI, FluxF, FluxFw, eigen_local_x, p, rho, u, v, w, fdata.y, T, H, eigen_block_x);
});
});
#endif