From 6c243ff6c2786557d0d7a5f63f513587d8446c24 Mon Sep 17 00:00:00 2001 From: Peter Hjort Lauritzen Date: Mon, 18 Aug 2025 08:06:07 -0600 Subject: [PATCH 1/9] performance improvement from John Dennis (CISL) --- .../se/dycore/fvm_reconstruction_mod.F90 | 167 ++++++++++-------- 1 file changed, 89 insertions(+), 78 deletions(-) diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 index b4708dfd3b..fad8f41cb5 100644 --- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 +++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 @@ -151,7 +151,7 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,& end select end if call slope_limiter(nhe,nc,nhc,fcube(:,:,k_in,itr),jx,jy,irecons,recons(:,:,:,itr),& - spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe),& + spherecentroid,& recons_metrics,vertex_recons_weights,vtx_cart,irecons_actual) end if end do @@ -356,6 +356,7 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,re real (kind=r8), dimension(-1:1) :: minval_array, maxval_array real (kind=r8), parameter :: threshold = 1.0E-40_r8 character(len=128) :: errormsg + select case (irecons_actual) ! ! PLM limiter @@ -564,6 +565,7 @@ end subroutine slope_limiter ! recons ... array of reconstructed coefficients ! ! OUTPUT: value ... evaluation at a given point ! !-----------------------------------------------------------------------------------! + !DIR$ ATTRIBUTES FORCEINLINE :: recons_val_cart subroutine recons_val_cart(fcube, cartx, carty, centroid, pre_computed_metrics, recons, value) implicit none real(kind=r8), intent(in) :: fcube @@ -586,6 +588,7 @@ subroutine recons_val_cart(fcube, cartx, carty, centroid, pre_computed_metrics, recons(6) * (pre_computed_metrics(3) + dx*dy) END subroutine recons_val_cart + !DIR$ ATTRIBUTES FORCEINLINE :: recons_val_cart_plm subroutine recons_val_cart_plm(fcube, cartx, carty, centroid, recons, value) implicit none real(kind=r8), intent(in) :: fcube @@ -640,43 +643,62 @@ subroutine slopelimiter_val(value, cell_value, local_min, local_max, min_phi) end subroutine slopelimiter_val !END SUBROUTINE SLOPELIMITER_VAL------------------------------------------CE-for FVM! - function matmul_w(w,f,ns) + !DIR$ ATTRIBUTES FORCEINLINE :: dotproduct + function dotproduct(w,f,ns) + implicit none + real (kind=r8) :: dotproduct + real (kind=r8),dimension(:), intent(in) :: w,f !dimension(ns) + integer, intent(in) :: ns + integer :: k + + if(ns==3) then + dotproduct = DotProduct_3(w,f) + else + dotproduct = DotProduct_gen(w,f,ns) + endif + + end function dotproduct + + !DIR$ ATTRIBUTES FORCEINLINE :: DotProduct_gen + function DotProduct_gen(w,f,ns) implicit none - real (kind=r8) :: matmul_w + real (kind=r8) :: DotProduct_gen real (kind=r8),dimension(:), intent(in) :: w,f !dimension(ns) integer, intent(in) :: ns integer :: k - matmul_w = 0.0_r8 + DotProduct_gen = 0.0_r8 do k=1,ns - matmul_w = matmul_w+w(k)*f(k) + DotProduct_gen = DotProduct_gen+w(k)*f(k) end do - end function matmul_w + end function DotProduct_gen ! special hard-coded version of the function where ns=3 ! for performance optimization -! function matmul_w(w, f) -! IMPLICIT NONE -! REAL(KIND=r8), dimension(3), intent(in) :: w -! REAL(KIND=r8), dimension(3), intent(in) :: f -! REAL(KIND=r8) :: matmul_w -! matmul_w = w(1)*f(1) + w(2)*f(2) + w(3)*f(3) -! end function matmul_w - - subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo_interp_weight,ibase,& + !DIR$ ATTRIBUTES FORCEINLINE :: DotProduct_3 + function DotProduct_3(w, f) + IMPLICIT NONE + REAL(KIND=r8), dimension(3), intent(in) :: w + REAL(KIND=r8), dimension(3), intent(in) :: f + REAL(KIND=r8) :: DotProduct_3 + DotProduct_3 = w(1)*f(1) + w(2)*f(2) + w(3)*f(3) + end function DotProduct_3 + + subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,hWeight,ibase,& fpanel,fotherpanel) implicit none integer, intent(in) :: cubeboundary,nc,nhr,nht,nh,nhc,ns real (kind=r8), & dimension(1-nhc:nc+nhc, 1-nhc:nc+nhc), intent(in) :: fcube - real (kind=r8), intent(in) :: halo_interp_weight(1:ns,1-nh:nc+nh,1:nhr,2) + real (kind=r8), intent(in) :: hWeight(1:ns,1-nh:nc+nh,1:nhr,2) integer , intent(in) :: ibase(1-nh:nc+nh,1:nhr,2) real (kind=r8) , dimension(1-nht:nc+nht, 1-nht:nc+nht ), intent(out) :: fpanel real (kind=r8), dimension(1-nht:nc+nht,1-nht:nc+nht,2), intent(out), optional :: fotherpanel integer :: i, halo,ibaseref - real (kind=r8), dimension(1:ns,1-nh:nc+nh,1:nhr) :: w + + real (kind=r8), dimension(1-nhc:nc+nhc) :: ftmp ! ! fpanel = 1.0E19 !dbg ! @@ -783,12 +805,11 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! fill in values that are on the west panels projection ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr + ftmp(:) = fcube(1-halo,:) ! copy to a temporary do i=halo-nh,nc+nh-(halo-1) ibaseref=ibase(i,halo,1) - ! ibaseref = ibase(i,halo,1) - fpanel(1-halo ,i) = matmul_w(w(:,i,halo),fcube(1-halo ,ibaseref:ibaseref+ns-1),ns) + fpanel(1-halo ,i) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do @@ -799,12 +820,13 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo fotherpanel (1-nht:0,1-nht:nc+nht,1)=fcube(1-nht:0,1-nht:nc+nht) ! do halo=1,nhr + ftmp(:) = fcube(halo,:) ! copy to a temporary do i=halo-nh,nc+nh-(halo-1) ibaseref=ibase(i,halo,1) ! ! Exploit symmetry in interpolation weights ! - fotherpanel(halo,i,1) = matmul_w(w(:,i,halo),fcube(halo ,ibaseref:ibaseref+ns-1),ns) + fotherpanel(halo,i,1) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do end if @@ -851,21 +873,21 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! -3 |-2 |-1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 ! fpanel (1-nht:nc ,1-nht:nc+nht )=fcube(1-nht:nc ,1-nht:nc+nht) - w = halo_interp_weight(:,:,:,1) do halo=1,nhr + ftmp(:) = fcube(nc+halo,:) ! copy to a temporary do i=halo-nh,nc+nh-(halo-1) ibaseref = ibase(i,halo,1) - fpanel (nc+halo ,i ) = matmul_w(w(:,i,halo),fcube(nc +halo,ibaseref:ibaseref+ns-1),ns) + fpanel (nc+halo ,i ) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do if (present(fotherpanel)) then fotherpanel (nc+1 :nc+nht ,1-nht:nc+nht,1)=fcube(nc+1 :nc+nht ,1-nht:nc+nht) ! do halo=1,nhr + ftmp(:) = fcube(nc+1-halo,:) ! copy to a temporary do i=halo-nh,nc+nh-(halo-1) - ! ibaseref=ibase(i,halo,1 ) ibaseref = ibase(i,halo,1) - fotherpanel (nc+1-halo ,i,1) = matmul_w(w(:,i,halo),fcube(nc+1-halo,ibaseref:ibaseref+ns-1),ns) + fotherpanel (nc+1-halo ,i,1) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do end if @@ -905,11 +927,10 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! fill in values that are on the same projection as "main" element fpanel (1-nht:nc+nht ,1-nht:nc)=fcube(1-nht:nc+nht ,1-nht:nc) - w = halo_interp_weight(:,:,:,1) do halo=1,nhr do i=halo-nh,nc+nh-(halo-1) ibaseref = ibase(i,halo,1) - fpanel (i,nc+halo ) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+halo ),ns) !north + fpanel (i,nc+halo ) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,nc+halo),ns) !north end do end do if (present(fotherpanel)) then @@ -919,7 +940,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo do halo=1,nhr do i=halo-nh,nc+nh-(halo-1) ibaseref = ibase(i,halo,1) - fotherpanel (i,nc+1-halo,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns) + fotherpanel (i,nc+1-halo,1) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns) end do end do end if @@ -960,19 +981,19 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! fill in values that are on the same projection as "main" element (marked with "i" in Figure above) ! fpanel (1-nht:nc+nht,1:nc+nht )=fcube(1-nht:nc+nht,1:nc+nht) - w = halo_interp_weight(:,:,:,1) do halo=1,nhr do i=halo-nh,nc+nh-(halo-1) ibaseref=ibase(i,halo,1)!ibase(i,halo,2) - fpanel (i,1-halo ) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,1-halo),ns) !south + fpanel (i,1-halo ) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,1-halo),ns) !south end do end do if (present(fotherpanel)) then fotherpanel (1-nht:nc+nht,1-nht:0 ,1)=fcube(1-nht:nc+nht,1-nht:0 ) do halo=1,nhr + ftmp(:) = fcube(:,halo) do i=halo-nh,nc+nh-(halo-1) ibaseref=ibase(i,halo,1)!ibase(i,halo,2) - fotherpanel (i, halo,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1, halo),ns) + fotherpanel (i, halo,1) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1, halo),ns) end do end do end if @@ -1018,12 +1039,12 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! fill in west part (marked with "w" on Figure above) and south part (marked with "s") ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr + ftmp(:) = fcube(1-halo,:) ! copy to a temporary do i=max(halo-nh,0),nc+nh-(halo-1) ibaseref=ibase(i,halo,1)!ibase(i,halo,1) - fpanel(1-halo ,i) = matmul_w(w(:,i,halo),fcube(1-halo ,ibaseref:ibaseref+ns-1),ns) !west - fpanel(i,1-halo ) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,1-halo) ,ns) !south + fpanel(1-halo ,i) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) !west + fpanel(i,1-halo ) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,1-halo) ,ns) !south end do end do ! @@ -1075,20 +1096,18 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! compute interpolated cell average values in "p" cells on Figure on above ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr do i=max(halo-nh,0),nc+nh-(halo-1) ibaseref=ibase(i,halo,1) ! ! use same weights as interpolation south from main panel (symmetric) ! - fotherpanel(i,halo,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,halo),ns) + fotherpanel(i,halo,1) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,halo),ns) end do end do ! ! compute interpolated cell average values in "w" cells on Figure on above ! - w = halo_interp_weight(:,:,:,2) do halo=1,nhr do i=nc+halo-nhr,nc+1 ibaseref=ibase(i,halo,2)-nc @@ -1108,7 +1127,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! | | ! =============================== ! - fotherpanel(1-halo,i-nc,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,halo),ns) + fotherpanel(1-halo,i-nc,1) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,halo),ns) end do end do fotherpanel(0,1,1) = 0.25_r8*(fotherpanel(-1,1,1)+fotherpanel(1,1,1)+fotherpanel(0,2,1)+fotherpanel(0,0,1)) @@ -1164,21 +1183,21 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! compute interpolated cell average values in "p" cells on Figure on above ! - w = halo_interp_weight(:,:,:,1) ! symmetry do halo=1,nhr + ftmp(:) = fcube(halo,:) ! copy to a temporary do i=max(halo-nh,0),nc+nh-(halo-1) ibaseref=ibase(i,halo,1) ! ! use same weights as interpolation south from main panel (symmetric) ! - fotherpanel(halo,i,2) = matmul_w(w(:,i,halo),fcube(halo,ibaseref:ibaseref+ns-1),ns) + fotherpanel(halo,i,2) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do ! ! compute interpolated cell average values in "s" cells on Figure on above ! - w = halo_interp_weight(:,:,:,2) do halo=1,nhr + ftmp(:) = fcube(halo,:) ! copy to a temporary do i=nc+halo-nhr,nc+1 ibaseref=ibase(i,halo,2)-nc ! @@ -1197,7 +1216,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! | | ! =============================== ! - fotherpanel(i-nc,1-halo,2) = matmul_w(w(:,i,halo),fcube(halo,ibaseref:ibaseref+ns-1),ns) + fotherpanel(i-nc,1-halo,2) = dotproduct(hWeight(:,i,halo,2),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do fotherpanel(1,0,2) = 0.25_r8*(fotherpanel(0,0,2)+fotherpanel(2,0,2)+fotherpanel(1,-1,2)+fotherpanel(1,1,2)) @@ -1235,21 +1254,20 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! east ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr + ftmp(:) = fcube(nc+halo,:) ! copy to a temporary do i=max(halo-nh,0),nc+nh-(halo-1) ibaseref = ibase(i,halo,1) - fpanel(nc+halo,i) = matmul_w(w(:,i,halo),fcube(nc +halo,ibaseref:ibaseref+ns-1),ns) + fpanel(nc+halo,i) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do ! ! south ! - w = halo_interp_weight(:,:,:,2) do halo=1,nhr do i=halo-nh,min(nc+nh-(halo-1),nc+1) ibaseref = ibase(i,halo,2) - fpanel(i,1-halo ) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,1-halo),ns) !south + fpanel(i,1-halo ) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,1-halo),ns) !south end do end do fpanel(nc+1,0 )=0.25_r8*(& @@ -1299,20 +1317,19 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo if (present(fotherpanel)) then fotherpanel(1-nht:nc,1-nht:0,1) = fcube(1-nht:nc,1-nht:0) ! - w = halo_interp_weight(:,:,:,2) ! ! fill in "n" on Figure above ! do halo=1,nhr + ftmp(:) = fcube(:,halo) do i=halo-nh,min(nc+nh-(halo-1),nc+1) ibaseref = ibase(i,halo,2) - fotherpanel (i,halo,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1, halo),ns) + fotherpanel (i,halo,1) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1, halo),ns) end do end do ! ! fill in "e" on Figure above ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr do i=0,nht-halo!nc+nh-(halo-1) ibaseref = ibase(i,halo,1) @@ -1322,7 +1339,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! use symmetry for weights (same weights as East from main panel but for south panel ! projection the indecies are rotated) ! - fotherpanel (nc+halo ,1-i,1) = matmul_w(w(:,i,halo),fcube(nc+ibaseref:nc+ibaseref+ns-1,halo),ns) + fotherpanel (nc+halo ,1-i,1) = dotproduct(hWeight(:,i,halo,1),fcube(nc+ibaseref:nc+ibaseref+ns-1,halo),ns) end do end do fotherpanel(nc+1,1,1) = 0.25_r8*(fotherpanel(nc+2,1,1)+fotherpanel(nc,1,1)& @@ -1378,18 +1395,18 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! fill in "w" on Figure above ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr + ftmp(:) = fcube(nc+1-halo,:) ! copy to a temporary do i=0,nc+nh-(halo-1) ibaseref = ibase(i,halo,1) - fotherpanel(nc+1-halo,i,2) = matmul_w(w(:,i,halo),fcube(nc+1-halo,ibaseref:ibaseref+ns-1),ns) + fotherpanel(nc+1-halo,i,2) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do ! ! fill in "s" on Figure above ! - w = halo_interp_weight(:,:,:,2) do halo=1,nhr + ftmp(:) = fcube(nc+1-halo,:) ! copy to a temporary do i=nc+1-nht+halo,nc+1 ! ! @@ -1417,7 +1434,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! fcube index: due to rotation (see Figure above) ! - fotherpanel(nc+(nc+1-i),1-halo,2) = matmul_w(w(:,i,halo),fcube(nc+1-halo,ibaseref:ibaseref+ns-1),ns) + fotherpanel(nc+(nc+1-i),1-halo,2) = dotproduct(hWeight(:,i,halo,2),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do fotherpanel(nc,0,2) = 0.25_r8*(fotherpanel(nc+1,0,2)+fotherpanel(nc-1,0,2)& @@ -1454,21 +1471,20 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! west ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr + ftmp(:) = fcube(1-halo,:) ! copy to a temporary do i=halo-nh,min(nc+nh-(halo-1),nc+1) ibaseref=ibase(i,halo,1) - fpanel(1-halo ,i) = matmul_w(w(:,i,halo),fcube(1-halo ,ibaseref:ibaseref+ns-1),ns) + fpanel(1-halo ,i) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do ! ! north ! - w = halo_interp_weight(:,:,:,2) do halo=1,nhr do i=max(halo-nh,0),nc+nh-(halo-1) ibaseref = ibase(i,halo,2) - fpanel(i,nc+halo) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+halo ),ns) !north + fpanel(i,nc+halo) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,nc+halo ),ns) !north end do end do fpanel(0 ,nc+1)=0.25_r8*(& @@ -1509,11 +1525,10 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! (use code from north above) ! - w = halo_interp_weight(:,:,:,2) do halo=1,nhr do i=max(halo-nh,0),nc+nh-(halo-1) ibaseref = ibase(i,halo,2) - fotherpanel(i,nc+1-halo,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+1-halo ),ns) + fotherpanel(i,nc+1-halo,1) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,nc+1-halo ),ns) end do end do ! @@ -1521,11 +1536,10 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! (use code from west above) ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr do i=nc+1-nht+halo,nc+1 ibaseref=ibase(i,halo,1)-nc - fotherpanel(1-halo,nc-(i-(nc+1)),1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns) + fotherpanel(1-halo,nc-(i-(nc+1)),1) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns) end do end do fotherpanel(0,nc,1)=0.25_r8*(& @@ -1574,11 +1588,11 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! (use code from west above) ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr + ftmp(:) = fcube(halo,:) ! copy to a temporary do i=halo-nh,min(nc+nh-(halo-1),nc+1) ibaseref=ibase(i,halo,1) - fotherpanel(halo ,i,2) = matmul_w(w(:,i,halo),fcube(halo ,ibaseref:ibaseref+ns-1),ns) + fotherpanel(halo ,i,2) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do ! @@ -1587,11 +1601,11 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! (use code from north above) ! - w = halo_interp_weight(:,:,:,2) do halo=1,nhr + ftmp(:) = fcube(halo,:) ! copy to a temporary do i=0,nht-halo ibaseref = ibase(i,halo,2)+nc - fotherpanel(1-i,nc+halo,2) = matmul_w(w(:,i,halo),fcube(halo,ibaseref:ibaseref+ns-1),ns) !north + fotherpanel(1-i,nc+halo,2) = dotproduct(hWeight(:,i,halo,2),ftmp(ibaseref:ibaseref+ns-1),ns) !north end do end do fotherpanel(1,nc+1,2)=0.25_r8*(& @@ -1630,21 +1644,20 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! east ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr + ftmp(:) = fcube(nc+halo,:) ! copy to a temporary do i=halo-nh,min(nc+nh-(halo-1),nc+1) ibaseref=ibase(i,halo,1 ) - fpanel(nc+halo,i) = matmul_w(w(:,i,halo),fcube(nc +halo,ibaseref:ibaseref+ns-1),ns) + fpanel(nc+halo,i) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do ! ! north ! - ! w = halo_interp_weight(:,:,:,1) do halo=1,nhr do i=halo-nh,min(nc+nh-(halo-1),nc+1) ibaseref=ibase(i,halo,1) - fpanel(i,nc+halo) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+halo ),ns) !north + fpanel(i,nc+halo) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,nc+halo ),ns) !north end do end do fpanel(nc+1,nc+1)=0.25_r8*(& @@ -1697,17 +1710,15 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! (use north case from above and shift/reverse j-index ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr do i=halo-nh,min(nc+nh-(halo-1),nc+1) ibaseref=ibase(i,halo,1) - fotherpanel (i,nc+1-halo,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns) + fotherpanel (i,nc+1-halo,1) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns) end do end do ! ! fill in "e" on Figure above ! - w = halo_interp_weight(:,:,:,2) do halo=1,nhr do i=max(halo-nh,0),nht-halo ibaseref=ibase(i,halo,2) +nc @@ -1715,7 +1726,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! fotherpanel uses indexing of main panel's projection ! fcube: rotated indexing ! - fotherpanel (nc+halo,nc+i,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns) + fotherpanel (nc+halo,nc+i,1) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns) end do end do fotherpanel(nc+1,nc,1)=0.25_r8*(& @@ -1774,25 +1785,25 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo ! ! (use east case from above and shift/reverse j-index ! - w = halo_interp_weight(:,:,:,1) do halo=1,nhr + ftmp(:) = fcube(nc+1-halo,:) ! copy to a temporary do i=halo-nh,min(nc+nh-(halo-1),nc+1) ibaseref=ibase(i,halo,1 ) - fotherpanel(nc+1-halo,i,2) = matmul_w(w(:,i,halo),fcube(nc+1-halo,ibaseref:ibaseref+ns-1),ns) + fotherpanel(nc+1-halo,i,2) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do ! ! fill in "n" on Figure above ! - w = halo_interp_weight(:,:,:,2) do halo=1,nhr + ! ftmp(:) = fcube(nc+1-halo,:) ! copy to a temporary do i=max(halo-nh,0),nht-halo ibaseref=ibase(i,halo,2) +nc ! ! fotherpanel uses indexing of main panel's projection ! fcube: rotated indexing ! - fotherpanel (nc+i,nc+halo,2) = matmul_w(w(:,i,halo),fcube(nc+1-halo,ibaseref:ibaseref+ns-1),ns) + fotherpanel (nc+i,nc+halo,2) = dotproduct(hWeight(:,i,halo,2),ftmp(ibaseref:ibaseref+ns-1),ns) end do end do fotherpanel(nc,nc+1,2)=0.25_r8*(& From 948926f154988833eaac9ada947e494dd36604b5 Mon Sep 17 00:00:00 2001 From: Peter Hjort Lauritzen Date: Mon, 18 Aug 2025 08:14:41 -0600 Subject: [PATCH 2/9] update ChangeLog --- doc/ChangeLog | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/doc/ChangeLog b/doc/ChangeLog index 2381ecdeb7..9a6a924dd1 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,39 @@ +=============================================================== + +Tag name: cam6_ +Originator(s): johnmauff, pel +Date: April 28, 2025 +One-line Summary: +Github PR URL: https://github.com/ESCOMP/CAM/pull/ + +Purpose of changes (include the issue number and title text for each relevant GitHub issue): + Excessive data movement in extend_panel_interpolate (CSLAM): https://github.com/ESCOMP/CAM/issues/1316 + + The subroutine extend_panel_interpolate is written such that the compiler will generate more data movement than is necessary. + This excessive data movement intensifies a computational load imbalance in the CSLAM advection. While it is impossible to eliminate + the load imbalance that is caused by the special treatment of panels at the corners of the cubed sphere, it is possible to reduce + the cost of this subroutine by changing the way that the subroutine is written. + +Describe any changes made to build system: N/A + +Describe any changes made to the namelist: N/A + +List any changes to the defaults for the boundary datasets: N/A + +Describe any substantial timing or memory changes: N/A + +Code reviewed by: + +List all files eliminated: N/A + +List all files added and what they do: N/A + +List all existing files that have been modified, and describe the changes: + +.... + +Summarize any changes to answers: + No answer changes, all b4b =============================================================== From e6d5b831d8b43847375c33355f2b518558d3b8c6 Mon Sep 17 00:00:00 2001 From: Peter Hjort Lauritzen Date: Wed, 20 Aug 2025 03:35:39 -0600 Subject: [PATCH 3/9] remove unused variables --- doc/ChangeLog | 2 +- src/dynamics/se/dycore/fvm_reconstruction_mod.F90 | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/ChangeLog b/doc/ChangeLog index 9a6a924dd1..59086e3916 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -3,7 +3,7 @@ Tag name: cam6_ Originator(s): johnmauff, pel Date: April 28, 2025 -One-line Summary: +One-line Summary: Github PR URL: https://github.com/ESCOMP/CAM/pull/ Purpose of changes (include the issue number and title text for each relevant GitHub issue): diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 index fad8f41cb5..13e1aa0cf2 100644 --- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 +++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 @@ -990,7 +990,6 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,hWei if (present(fotherpanel)) then fotherpanel (1-nht:nc+nht,1-nht:0 ,1)=fcube(1-nht:nc+nht,1-nht:0 ) do halo=1,nhr - ftmp(:) = fcube(:,halo) do i=halo-nh,nc+nh-(halo-1) ibaseref=ibase(i,halo,1)!ibase(i,halo,2) fotherpanel (i, halo,1) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1, halo),ns) @@ -1321,7 +1320,6 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,hWei ! fill in "n" on Figure above ! do halo=1,nhr - ftmp(:) = fcube(:,halo) do i=halo-nh,min(nc+nh-(halo-1),nc+1) ibaseref = ibase(i,halo,2) fotherpanel (i,halo,1) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1, halo),ns) From 9e7393bd10c3dc9aa5fdac5a1f8281dc978160af Mon Sep 17 00:00:00 2001 From: Peter Hjort Lauritzen Date: Thu, 21 Aug 2025 07:24:18 -0600 Subject: [PATCH 4/9] fix bug --- src/dynamics/se/dycore/fvm_reconstruction_mod.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 index 13e1aa0cf2..51a432fba6 100644 --- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 +++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 @@ -1794,7 +1794,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,hWei ! fill in "n" on Figure above ! do halo=1,nhr - ! ftmp(:) = fcube(nc+1-halo,:) ! copy to a temporary + ftmp(:) = fcube(nc+1-halo,:) ! copy to a temporary do i=max(halo-nh,0),nht-halo ibaseref=ibase(i,halo,2) +nc ! From 8c08ece782f6c22b209aeea4440b77c9c270330f Mon Sep 17 00:00:00 2001 From: Peter Hjort Lauritzen Date: Thu, 23 Oct 2025 04:03:45 -0600 Subject: [PATCH 5/9] better vectorization for large NTRAC, ifdef for timers --- .../se/dycore/fvm_consistent_se_cslam.F90 | 57 ++++++++++--------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 b/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 index d3becc8446..3e56498b38 100644 --- a/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 +++ b/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 @@ -1,3 +1,4 @@ +#define FVM_TIMERS .FALSE. module fvm_consistent_se_cslam use shr_kind_mod, only: r8=>shr_kind_r8 use dimensions_mod, only: nc, nhe, nlev, ntrac, np, nhr, nhc, ngpc, ns, nht @@ -107,7 +108,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& endif kblk = kmax-kmin+1 - !call t_startf('fvm:before_Qnhc') + if(FVM_TIMERS) call t_startf('fvm:before_Qnhc') do ie=nets,nete do k=kmin,kmax elem(ie)%sub_elem_mass_flux(:,:,:,k) = dt_fvm*elem(ie)%sub_elem_mass_flux(:,:,:,k)*fvm(ie)%dp_ref_inverse(k) @@ -120,11 +121,11 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& call ghostpack(ghostbufQnhc,fvm(ie)%c(1-nhc:nc+nhc,1-nhc:nc+nhc,kmin:kmax,q),kblk,kptr,ie) enddo end do - !call t_stopf('fvm:before_Qnhc') - !call t_startf('fvm:ghost_exchange:Qnhc') + if(FVM_TIMERS) call t_stopf('fvm:before_Qnhc') + if(FVM_TIMERS) call t_startf('fvm:ghost_exchange:Qnhc') call ghost_exchange(hybridnew,ghostbufQnhc,location='ghostbufQnhc') - !call t_stopf('fvm:ghost_exchange:Qnhc') - !call t_startf('fvm:orthogonal_swept_areas') + if(FVM_TIMERS) call t_stopf('fvm:ghost_exchange:Qnhc') + if(FVM_TIMERS) call t_startf('fvm:orthogonal_swept_areas') do ie=nets,nete do k=kmin,kmax fvm(ie)%se_flux (1:nc,1:nc,:,k) = elem(ie)%sub_elem_mass_flux(:,:,:,k) @@ -152,14 +153,14 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& end do enddo - !call t_stopf('fvm:orthogonal_swept_areas') + if(FVM_TIMERS) call t_stopf('fvm:orthogonal_swept_areas') do ie=nets,nete ! Intel compiler version 2023.0.0 on derecho had significant slowdown on subroutine interface without ! these pointers. fcube => fvm(ie)%c(:,:,:,:) spherecentroid => fvm(ie)%spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe) do k=kmin,kmax - !call t_startf('FVM:tracers_reconstruct') + if(FVM_TIMERS) call t_startf('FVM:tracers_reconstruct') call reconstruction(fcube,nlev,k,& ctracer(:,:,:,:),irecons_tracer,llimiter,ntrac,& nc,nhe,nhr,nhc,nht,ns,nhr+(nhe-1),& @@ -170,10 +171,10 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& fvm(ie)%rot_matrix,fvm(ie)%centroid_stretch,& fvm(ie)%vertex_recons_weights,fvm(ie)%vtx_cart,& irecons_tracer_lev(k)) - !call t_stopf('FVM:tracers_reconstruct') - !call t_startf('fvm:swept_flux') + if(FVM_TIMERS) call t_stopf('FVM:tracers_reconstruct') + if(FVM_TIMERS) call t_startf('fvm:swept_flux') call swept_flux(elem(ie),fvm(ie),k,ctracer,irecons_tracer_lev(k),gsweights,gspts) - !call t_stopf('fvm:swept_flux') + if(FVM_TIMERS) call t_stopf('fvm:swept_flux') end do end do ! @@ -193,7 +194,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& ! ! if (large_Courant_incr) then - !call t_startf('fvm:fill_halo_fvm:large_Courant') + if(FVM_TIMERS) call t_startf('fvm:fill_halo_fvm:large_Courant') !if (kmin_jetkmax) then ! call endrun('ERROR: kmax_jet must be .le. kmax passed to run_consistent_se_cslam') !end if @@ -203,8 +204,8 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& kmax_jet_local = min(kmax_jet,kmax) klev = kmax_jet-kmin_jet+1 call fill_halo_fvm(ghostbufQ1,elem,fvm,hybridnew,nets,nete,1,kmin_jet_local,kmax_jet_local,klev,active=ActiveJetThread) - !call t_stopf('fvm:fill_halo_fvm:large_Courant') - !call t_startf('fvm:large_Courant_number_increment') + if(FVM_TIMERS) call t_stopf('fvm:fill_halo_fvm:large_Courant') + if(FVM_TIMERS) call t_startf('fvm:large_Courant_number_increment') if(ActiveJetThread) then do k=kmin_jet_local,kmax_jet_local !1,nlev do ie=nets,nete @@ -212,10 +213,10 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& end do end do endif - !call t_stopf('fvm:large_Courant_number_increment') + if(FVM_TIMERS) call t_stopf('fvm:large_Courant_number_increment') end if - !call t_startf('fvm:end_of_reconstruct_subroutine') + if(FVM_TIMERS) call t_startf('fvm:end_of_reconstruct_subroutine') do k=kmin,kmax ! ! convert to mixing ratio @@ -251,7 +252,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& elem(ie)%sub_elem_mass_flux(:,:,:,k)=0 end do end do - !call t_stopf('fvm:end_of_reconstruct_subroutine') + if(FVM_TIMERS) call t_stopf('fvm:end_of_reconstruct_subroutine') !$OMP END PARALLEL call omp_set_nested(.false.) end subroutine run_consistent_se_cslam @@ -281,7 +282,7 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt REAL(KIND=r8), dimension(2,8) :: x_start, dgam_vec REAL(KIND=r8) :: gamma_max, displ_first_guess - REAL(KIND=r8) :: flux,flux_tracer(ntrac) + REAL(KIND=r8) :: flux,flux_tracer(ntrac),w REAL(KIND=r8), dimension(num_area) :: dp_area @@ -306,7 +307,6 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt ! ! prepare for air/tracer update ! -! dp = fvm%dp_fvm(1-nhe:nc+nhe,1-nhe:nc+nhe,ilev) dp = fvm%dp_fvm(1-nhc:nc+nhc,1-nhc:nc+nhc,ilev) fvm%dp_fvm(1:nc,1:nc,ilev) = fvm%dp_fvm(1:nc,1:nc,ilev)*fvm%area_sphere do itr=1,ntrac @@ -538,14 +538,14 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt ! ! iterate to get flux area ! - !call t_startf('fvm:swept_area:get_gamma') + if(FVM_TIMERS) call t_startf('fvm:swept_area:get_gamma') do iarea=1,num_area dp_area(iarea) = dp(idx(1,iarea,i,j,iside),idx(2,iarea,i,j,iside)) end do call get_flux_segments_area_iterate(x,x_static,dx_static,dx,x_start,dgam_vec,num_seg,num_seg_static,& num_seg_max,num_area,dp_area,flowcase,gamma,mass_flux_se(i,j,iside),0.0_r8,gamma_max, & gsweights,gspts,ilev) - !call t_stopf('fvm:swept_area:get_gamma') + if(FVM_TIMERS) call t_stopf('fvm:swept_area:get_gamma') ! ! pack segments for high-order weights computation ! @@ -560,10 +560,10 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt ! ! compute higher-order weights ! - !call t_startf('fvm:swept_area:get_high_order_w') + if(FVM_TIMERS) call t_startf('fvm:swept_area:get_high_order_w') call get_high_order_weights_over_areas(x,dx,num_seg,num_seg_max,num_area,weights,ngpc,& gsweights, gspts,irecons_tracer) - !call t_stopf('fvm:swept_area:get_high_order_w') + if(FVM_TIMERS) call t_stopf('fvm:swept_area:get_high_order_w') ! !************************************************** ! @@ -571,16 +571,17 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt ! !************************************************** ! - !call t_startf('fvm:swept_area:remap') + if(FVM_TIMERS) call t_startf('fvm:swept_area:remap') flux=0.0_r8; flux_tracer=0.0_r8 do iarea=1,num_area if (num_seg(iarea)>0) then ii=idx(1,iarea,i,j,iside); jj=idx(2,iarea,i,j,iside) flux=flux+weights(1,iarea)*dp(ii,jj) - do itr=1,ntrac - do iw=1,irecons_tracer_actual - flux_tracer(itr) = flux_tracer(itr)+weights(iw,iarea)*ctracer(iw,ii,jj,itr) - end do + do iw=1,irecons_tracer_actual + w = weights(iw,iarea) + do itr=1,ntrac + flux_tracer(itr) = flux_tracer(itr)+w*ctracer(iw,ii,jj,itr) + end do end do end if end do @@ -614,7 +615,7 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt fvm%dp_fvm(i-1,j,ilev ) = fvm%dp_fvm(i-1,j,ilev )+flux fvm% c(i-1,j,ilev,1:ntrac) = fvm% c(i-1,j,ilev,1:ntrac)+flux_tracer(1:ntrac) end if - !call t_stopf('fvm:swept_area:remap') + if(FVM_TIMERS) call t_stopf('fvm:swept_area:remap') end if end do end do From 6aefddb7c615a4538d25653223862268e8e28b3c Mon Sep 17 00:00:00 2001 From: Peter Hjort Lauritzen Date: Thu, 23 Oct 2025 04:14:05 -0600 Subject: [PATCH 6/9] better performance for large tracer counts --- .../se/dycore/fvm_reconstruction_mod.F90 | 435 +++++++++--------- 1 file changed, 224 insertions(+), 211 deletions(-) diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 index 51a432fba6..fd3dbe4716 100644 --- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 +++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 @@ -79,7 +79,7 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,& real (kind=r8), dimension(1-nht:nc+nht,1-nht:nc+nht,3) :: f - integer :: i,j,in,h,itr + integer :: i,j,in,h,itr,k integer, dimension(2,3) :: jx,jy if (present(irecons_actual_in)) then @@ -87,7 +87,6 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,& else irecons_actual = irecons end if - jx(1,1)=jx_min(1); jx(2,1)=jx_max(1)-1 jx(1,2)=jx_min(2); jx(2,2)=jx_max(2)-1 @@ -119,42 +118,12 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,& end do end if if(FVM_TIMERS) call t_stopf('FVM:reconstruction:part#1') - if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#2') - ! - ! fill in non-existent (in physical space) corner values to simplify - ! logic in limiter code (min/max operation) - ! - do itr=1,ntrac_in - if (llimiter(itr)) then - if (cubeboundary>4) then - select case(cubeboundary) - case (nwest) - do h=1,nhe+1 - fcube(0,nc+h ,k_in,itr) = fcube(1-h,nc ,k_in,itr) - fcube(1-h,nc+1,k_in,itr) = fcube(1 ,nc+h,k_in,itr) - end do - case (swest) - do h=1,nhe+1 - fcube(1-h,0,k_in,itr) = fcube(1,1-h,k_in,itr) - fcube(0,1-h,k_in,itr) = fcube(1-h,1,k_in,itr) - end do - case (seast) - do h=1,nhe+1 - fcube(nc+h,0 ,k_in,itr) = fcube(nc,1-h,k_in,itr) - fcube(nc+1,1-h,k_in,itr) = fcube(nc+h,1,k_in,itr) - end do - case (neast) - do h=1,nhe+1 - fcube(nc+h,nc+1,k_in,itr) = fcube(nc,nc+h,k_in,itr) - fcube(nc+1,nc+h,k_in,itr) = fcube(nc+h,nc,k_in,itr) - end do - end select - end if - call slope_limiter(nhe,nc,nhc,fcube(:,:,k_in,itr),jx,jy,irecons,recons(:,:,:,itr),& - spherecentroid,& - recons_metrics,vertex_recons_weights,vtx_cart,irecons_actual) - end if - end do + if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#2') + + call slope_limiter(nhe,nc,nhc,fcube,jx,jy,k_in,nlev_in,ntrac_in,irecons,recons,& + spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe),& + recons_metrics,vertex_recons_weights,vtx_cart,irecons_actual,llimiter,& + cubeboundary) if(FVM_TIMERS) call t_stopf('FVM:reconstruction:part#2') end if if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#3') @@ -185,8 +154,6 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,& end do case(6) do itr=1,ntrac_in -! do j=1-nhe,nc+nhe -! do i=1-nhe,nc+nhe do in=1,3 do j=jy(1,in),jy(2,in) do i=jx(1,in),jx(2,in) @@ -331,31 +298,67 @@ subroutine get_gradients(f,jx,jy,irecons,gradient,rot_matrix,centroid_stretch,nc end subroutine get_gradients - subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,recons_metrics,& - vertex_recons_weights,vtx_cart,irecons_actual) + subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,k,nlev,ntrac,irecons,recons,spherecentroid,recons_metrics,& + vertex_recons_weights,vtx_cart,irecons_actual,llimiter,cubeboundary) implicit none - integer , intent(in) :: irecons,nhe,nc,nhc,irecons_actual - real (kind=r8), dimension(1-nhc:, 1-nhc:) , intent(in) :: fcube - real (kind=r8), dimension(irecons,1-nhe:nc+nhe,1-nhe:nc+nhe) , intent(inout):: recons + integer , intent(in) :: irecons_actual,k,nlev,ntrac + integer , intent(in) :: irecons,nhe,nc,nhc + real (kind=r8), dimension(1-nhc:nc+nhc,1-nhc:nc+nhc,nlev,ntrac) , intent(inout) :: fcube + real (kind=r8), dimension(irecons,1-nhe:nc+nhe,1-nhe:nc+nhe,ntrac), intent(inout):: recons integer, dimension(2,3) , intent(in) :: jx,jy real (kind=r8), dimension(irecons-1,1-nhe:nc+nhe,1-nhe:nc+nhe) , intent(in) :: spherecentroid real (kind=r8), dimension(3,1-nhe:nc+nhe,1-nhe:nc+nhe) , intent(in) :: recons_metrics real (kind=r8), dimension(4,1:irecons-1,1-nhe:nc+nhe,1-nhe:nc+nhe), intent(in) :: vertex_recons_weights real (kind=r8), dimension(4,2,1-nhc:nc+nhc,1-nhc:nc+nhc) , intent(in) :: vtx_cart + logical, dimension(ntrac) , intent(in) :: llimiter + integer , intent(in) :: cubeboundary real (kind=r8):: minval_patch,maxval_patch real (kind=r8):: phi, min_val, max_val,disc + real (kind=r8):: v1,v2,v3,v4,vx1,vx2,vx3,vx4,vy1,vy2,vy3,vy4,r2,r3,r4,r5,r6,scx,scy,dx,dy,ex1,ex2,f0,val + real (kind=r8):: m1,m2,m3 real (kind=r8):: min_phi real (kind=r8):: extrema(2), xminmax(2),yminmax(2),extrema_value(13) real(kind=r8) :: invtmp ! temporary to pre-compute inverses - integer :: itmp1,itmp2,i,j,in,vertex,n + integer :: itmp1,itmp2,i,j,in,vertex,n,itr,h -! real (kind=r8), dimension(-1:5) :: diff_value real (kind=r8), dimension(-1:1) :: minval_array, maxval_array real (kind=r8), parameter :: threshold = 1.0E-40_r8 - character(len=128) :: errormsg + character(len=128) :: errormsg + integer :: im1,jm1,ip1,jp1 + ! + ! fill in non-existent (in physical space) corner values to simplify + ! logic in limiter code (min/max operation) + ! + do itr=1,ntrac + if (.not. llimiter(itr)) cycle + if (cubeboundary>4) then + select case(cubeboundary) + case (nwest) + do h=1,nhe+1 + fcube(0,nc+h ,k,itr) = fcube(1-h,nc ,k,itr) + fcube(1-h,nc+1,k,itr) = fcube(1 ,nc+h,k,itr) + end do + case (swest) + do h=1,nhe+1 + fcube(1-h,0,k,itr) = fcube(1,1-h,k,itr) + fcube(0,1-h,k,itr) = fcube(1-h,1,k,itr) + end do + case (seast) + do h=1,nhe+1 + fcube(nc+h,0 ,k,itr) = fcube(nc,1-h,k,itr) + fcube(nc+1,1-h,k,itr) = fcube(nc+h,1,k,itr) + end do + case (neast) + do h=1,nhe+1 + fcube(nc+h,nc+1,k,itr) = fcube(nc,nc+h,k,itr) + fcube(nc+1,nc+h,k,itr) = fcube(nc+h,nc,k,itr) + end do + end select + end if + end do select case (irecons_actual) ! @@ -365,52 +368,51 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,re do in=1,3 do j=jy(1,in),jy(2,in) do i=jx(1,in),jx(2,in) - !rck combined min/max and unrolled inner loop - !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1)) - !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1)) - !DIR$ SIMD - do itmp2=-1,+1 - itmp1 = j+itmp2 - minval_array(itmp2) = min(fcube(i-1,itmp1),fcube(i,itmp1),fcube(i+1,itmp1)) - maxval_array(itmp2) = max(fcube(i-1,itmp1),fcube(i,itmp1),fcube(i+1,itmp1)) - enddo - minval_patch = min(minval_array(-1),minval_array(0),minval_array(1)) - maxval_patch = max(maxval_array(-1),maxval_array(0),maxval_array(1)) - - min_phi=1.0_r8 - - ! - ! coordinate bounds (could be pre-computed!) - ! - xminmax(1) = min(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j)) - xminmax(2) = max(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j)) - yminmax(1) = min(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j)) - yminmax(2) = max(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j)) - - !rck restructured loop - !DIR$ SIMD - do vertex=1,4 - call recons_val_cart_plm(fcube(i,j), vtx_cart(vertex,1,i,j), vtx_cart(vertex,2,i,j), spherecentroid(:,i,j), & - recons(1:3,i,j), extrema_value(vertex)) + do itr = 1, ntrac + if (.not. llimiter(itr)) cycle + !rck combined min/max and unrolled inner loop + !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1)) + !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1)) + !DIR$ SIMD + do itmp2=-1,+1 + itmp1 = j+itmp2 + minval_array(itmp2) = min(fcube(i-1,itmp1,k,itr),fcube(i,itmp1,k,itr),fcube(i+1,itmp1,k,itr)) + maxval_array(itmp2) = max(fcube(i-1,itmp1,k,itr),fcube(i,itmp1,k,itr),fcube(i+1,itmp1,k,itr)) + enddo + minval_patch = min(minval_array(-1),minval_array(0),minval_array(1)) + maxval_patch = max(maxval_array(-1),maxval_array(0),maxval_array(1)) + min_phi=1.0_r8 + ! + ! coordinate bounds (could be pre-computed!) + ! + xminmax(1) = min(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j)) + xminmax(2) = max(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j)) + yminmax(1) = min(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j)) + yminmax(2) = max(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j)) + !rck restructured loop + !DIR$ SIMD + do vertex=1,4 + call recons_val_cart_plm(fcube(i,j,k,itr), vtx_cart(vertex,1,i,j), vtx_cart(vertex,2,i,j), spherecentroid(:,i,j), & + recons(1:3,i,j,itr), extrema_value(vertex)) + end do + max_val = MAXVAL(extrema_value(1:4)) + min_val = MINVAL(extrema_value(1:4)) + if (max_val>maxval_patch.and.abs(max_val-fcube(i,j,k,itr))>threshold) then + phi = (maxval_patch-fcube(i,j,k,itr))/(max_val-fcube(i,j,k,itr)) + if (phithreshold) then + phi = (minval_patch-fcube(i,j,k,itr))/(min_val-fcube(i,j,k,itr)) + if (phimaxval_patch.and.abs(max_val-fcube(i,j))>threshold) then - phi = (maxval_patch-fcube(i,j))/(max_val-fcube(i,j)) - if (phithreshold) then - phi = (minval_patch-fcube(i,j))/(min_val-fcube(i,j)) - if (phi threshold) then - extrema(1) = recons(6,i,j) * recons(3,i,j) - 2.0_r8 * recons(5,i,j) * recons(2,i,j) - extrema(2) = recons(6,i,j) * recons(2,i,j) - 2.0_r8 * recons(4,i,j) * recons(3,i,j) - - disc=1.0_r8/disc - extrema(1) = extrema(1) * disc + spherecentroid(1,i,j) - extrema(2) = extrema(2) * disc + spherecentroid(2,i,j) - if ( (extrema(1) - xminmax(1) > -threshold) .and. & !xmin - (extrema(1) - xminmax(2) < threshold) .and. & !xmax - (extrema(2) - yminmax(1) > -threshold) .and. & !ymin - (extrema(2) - yminmax(2) < threshold)) then !ymax - call recons_val_cart(fcube(i,j), extrema(1), extrema(2), spherecentroid(:,i,j), & - recons_metrics(:,i,j), recons(:,i,j), extrema_value(5)) - endif - endif - ! - ! Check all potential minimizer points along element boundaries - ! - if (abs(recons(6,i,j)) > threshold) then - invtmp = 1.0_r8 / (recons(6,i,j) + spherecentroid(2,i,j)) - do n=1,2 - ! Left edge, intercept with du/dx = 0 - extrema(2) = invtmp * (-recons(2,i,j) - 2.0_r8 * recons(4,i,j) * (xminmax(n) - spherecentroid(1,i,j))) - if ((extrema(2) > yminmax(1)-threshold) .and. (extrema(2) < yminmax(2)+threshold)) then - call recons_val_cart(fcube(i,j), xminmax(n), extrema(2), spherecentroid(:,i,j), & - recons_metrics(:,i,j), recons(:,i,j), extrema_value(5+n)) - endif - enddo - ! Top/bottom edge, intercept with du/dy = 0 - invtmp = 1.0_r8 / recons(6,i,j) + spherecentroid(1,i,j) - do n = 1,2 - extrema(1) = invtmp * (-recons(3,i,j) - 2.0_r8 * recons(5,i,j) * (yminmax(n) - spherecentroid(2,i,j))) - if ((extrema(1) > xminmax(1)-threshold) .and. (extrema(1) < xminmax(2)+threshold)) then - call recons_val_cart(fcube(i,j), extrema(1), yminmax(n),spherecentroid(:,i,j), & - recons_metrics(:,i,j), recons(:,i,j), extrema_value(7+n)) - endif - enddo - endif - - ! Top/bottom edge, y=const., du/dx=0 - if (abs(recons(4,i,j)) > threshold) then - invtmp = 1.0_r8 / (2.0_r8 * recons(4,i,j))! + spherecentroid(1,i,j) - do n = 1,2 - extrema(1) = spherecentroid(1,i,j)+& - invtmp * (-recons(2,i,j) - recons(6,i,j) * (yminmax(n) - spherecentroid(2,i,j))) - - if ((extrema(1) > xminmax(1)-threshold) .and. (extrema(1) < xminmax(2)+threshold)) then - call recons_val_cart(fcube(i,j), extrema(1), yminmax(n), spherecentroid(:,i,j),& - recons_metrics(:,i,j),recons(:,i,j), extrema_value(9+n)) - endif - enddo - endif - ! Left/right edge, x=const., du/dy=0 - if (abs(recons(5,i,j)) > threshold) then - invtmp = 1.0_r8 / (2.0_r8 * recons(5,i,j)) - do n = 1,2 - extrema(2) = spherecentroid(2,i,j)+& - invtmp * (-recons(3,i,j) - recons(6,i,j) * (xminmax(n) - spherecentroid(1,i,j))) - - if ((extrema(2)>yminmax(1)-threshold) .and. (extrema(2) < yminmax(2)+threshold)) then - call recons_val_cart(fcube(i,j), xminmax(n), extrema(2), spherecentroid(:,i,j), & - recons_metrics(:,i,j), recons(:,i,j), extrema_value(11+n)) - endif + vx1 = vtx_cart(1,1,i,j); vy1 = vtx_cart(1,2,i,j) + vx2 = vtx_cart(2,1,i,j); vy2 = vtx_cart(2,2,i,j) + vx3 = vtx_cart(3,1,i,j); vy3 = vtx_cart(3,2,i,j) + vx4 = vtx_cart(4,1,i,j); vy4 = vtx_cart(4,2,i,j) + xminmax(1) = min(vx1,vx2,vx3,vx4) + xminmax(2) = max(vx1,vx2,vx3,vx4) + yminmax(1) = min(vy1,vy2,vy3,vy4) + yminmax(2) = max(vy1,vy2,vy3,vy4) + scx = spherecentroid(1,i,j) + scy = spherecentroid(2,i,j) + m1 = recons_metrics(1,i,j) + m2 = recons_metrics(2,i,j) + m3 = recons_metrics(3,i,j) + im1 = i-1; ip1 = i+1 + jm1 = j-1; jp1 = j+1 + do itr = 1, ntrac + if (.not. llimiter(itr)) cycle + !rck combined min/max and unrolled inner loop + !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1)) + !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1)) + minval_patch = min( min( fcube(im1,jm1,k,itr), fcube(i ,jm1,k,itr), fcube(ip1,jm1,k,itr) ), & + min( fcube(im1,j ,k,itr), fcube(i ,j ,k,itr), fcube(ip1,j ,k,itr) ), & + min( fcube(im1,jp1,k,itr), fcube(i ,jp1,k,itr), fcube(ip1,jp1,k,itr) ) ) + maxval_patch = max( max( fcube(im1,jm1,k,itr), fcube(i ,jm1,k,itr), fcube(ip1,jm1,k,itr) ), & + max( fcube(im1,j ,k,itr), fcube(i ,j ,k,itr), fcube(ip1,j ,k,itr) ), & + max( fcube(im1,jp1,k,itr), fcube(i ,jp1,k,itr), fcube(ip1,jp1,k,itr) ) ) + min_phi=1.0_r8 + + f0 = fcube(i,j,k,itr) + min_val = f0; max_val = f0!initialize min/max + ! + ! compute min/max value at cell corners + !DIR$ SIMD + do vertex=1,4 + val = f0 + do itmp1=1,irecons-1 + val = val + recons(itmp1+1,i,j,itr)*vertex_recons_weights(vertex,itmp1,i,j) + enddo + min_val = min(min_val,val) + max_val = max(max_val,val) enddo - endif - !rck - combined min/max calculation and unrolled - ! max_val = MAXVAL(extrema_value) - ! min_val = MINVAL(extrema_value) - max_val = extrema_value(13) - min_val = extrema_value(13) - do itmp1 = 1,12,4 - max_val = max(max_val, extrema_value(itmp1),extrema_value(itmp1+1),extrema_value(itmp1+2),extrema_value(itmp1+3)) - min_val = min(min_val, extrema_value(itmp1),extrema_value(itmp1+1),extrema_value(itmp1+2),extrema_value(itmp1+3)) - enddo - !rck - - if (max_val>maxval_patch.and.abs(max_val-fcube(i,j))>threshold) then - phi = (maxval_patch-fcube(i,j))/(max_val-fcube(i,j)) - if (phithreshold) then - phi = (minval_patch-fcube(i,j))/(min_val-fcube(i,j)) - if (phi threshold) then +!not b4b ex1 = (r6*r3 - 2.0_r8*r5*r2) / disc + scx +!not b4b ex2 = (r6*r2 - 2.0_r8*r4*r3) / disc + scy + ex1 = (r6*r3 - 2.0_r8*r5*r2) + ex2 = (r6*r2 - 2.0_r8*r4*r3) + disc = 1 /disc + ex1 = ex1*disc+scx + ex2 = ex2*disc+scy + if ( ex1 > xminmax(1)-threshold .and. ex1 < xminmax(2)+threshold .and. & + ex2 > yminmax(1)-threshold .and. ex2 < yminmax(2)+threshold ) then + dx = ex1 - scx; dy = ex2 - scy + v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy) + max_val = max(max_val, v1) + min_val = min(min_val, v1) + end if + endif + ! + ! Check all potential minimizer points along element boundaries + ! + if (abs(r6) > threshold) then + invtmp = 1.0_r8 / (r6 + scy) + do n=1,2 + ! Left edge, intercept with du/dx = 0 + ex2 = invtmp * (-r2 - 2.0_r8 * r4 * (xminmax(n) - scx)) + if ((ex2 > yminmax(1)-threshold) .and. (ex2 < yminmax(2)+threshold)) then + dx = xminmax(n) - scx; dy = ex2 - scy + v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy) + max_val = max(max_val, v1) + min_val = min(min_val, v1) + endif + enddo + ! + ! Top/bottom edge, intercept with du/dy = 0 + ! + invtmp = 1.0_r8 / r6 + scx + do n = 1,2 + ex1 = invtmp * (-r3 - 2.0_r8 * r5 * (yminmax(n) - scy)) + if ( (ex1 > xminmax(1)-threshold) .and. (ex1 < xminmax(2)+threshold) ) then + dx = ex1 - scx; dy = yminmax(n) - scy + v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy) + max_val = max(max_val, v1) + min_val = min(min_val, v1) + endif + enddo + endif + ! + ! Top/bottom edge, y=const., du/dx=0 + ! + if (abs(r4) > threshold) then + invtmp = 1.0_r8 / (2.0_r8 * r4)! + spherecentroid(1,i,j) + do n = 1,2 + ex1 = scx+invtmp * (-r2 - r6 * (yminmax(n) - scy)) + if ((ex1 > xminmax(1)-threshold) .and. (ex1 < xminmax(2)+threshold)) then + dx = ex1 - scx; dy = yminmax(n) - scy + v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy) + max_val = max(max_val, v1) + min_val = min(min_val, v1) + endif + enddo + endif + ! + ! Top/bottom edge, y=const., du/dx=0 + ! + if (abs(r5) > threshold) then + invtmp = 1.0_r8 / (2.0_r8 * r5)! + spherecentroid(1,i,j) + do n = 1,2 + ex1 = scy+invtmp * (-r3 - r6 * (xminmax(n) - scx)) + if ((ex1 > yminmax(1)-threshold) .and. (ex1 < yminmax(2)+threshold)) then + dx = xminmax(n) - scx; dy = ex1 - scy + v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy) + max_val = max(max_val, v1) + min_val = min(min_val, v1) + endif + enddo + endif + ! + if (max_val>maxval_patch.and.abs(max_val-fcube(i,j,k,itr))>threshold) then + phi = (maxval_patch-fcube(i,j,k,itr))/(max_val-fcube(i,j,k,itr)) + if (phithreshold) then + phi = (minval_patch-fcube(i,j,k,itr))/(min_val-fcube(i,j,k,itr)) + if (phi Date: Mon, 24 Nov 2025 06:48:50 -0700 Subject: [PATCH 7/9] remove buggy code (answer changes!) --- .../se/dycore/fvm_reconstruction_mod.F90 | 35 +++---------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 index fd3dbe4716..8b425042be 100644 --- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 +++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 @@ -472,8 +472,6 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,k,nlev,ntrac,irecons,recons,sphe ! DO NOT NEED ABS here, if disc<0 we have a saddle point (no maximum or minimum) disc = 4.0_r8 * r4 *r5 - r6*r6 if (abs(disc) > threshold) then -!not b4b ex1 = (r6*r3 - 2.0_r8*r5*r2) / disc + scx -!not b4b ex2 = (r6*r2 - 2.0_r8*r4*r3) / disc + scy ex1 = (r6*r3 - 2.0_r8*r5*r2) ex2 = (r6*r2 - 2.0_r8*r4*r3) disc = 1 /disc @@ -490,37 +488,12 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,k,nlev,ntrac,irecons,recons,sphe ! ! Check all potential minimizer points along element boundaries ! - if (abs(r6) > threshold) then - invtmp = 1.0_r8 / (r6 + scy) - do n=1,2 - ! Left edge, intercept with du/dx = 0 - ex2 = invtmp * (-r2 - 2.0_r8 * r4 * (xminmax(n) - scx)) - if ((ex2 > yminmax(1)-threshold) .and. (ex2 < yminmax(2)+threshold)) then - dx = xminmax(n) - scx; dy = ex2 - scy - v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy) - max_val = max(max_val, v1) - min_val = min(min_val, v1) - endif - enddo - ! - ! Top/bottom edge, intercept with du/dy = 0 - ! - invtmp = 1.0_r8 / r6 + scx - do n = 1,2 - ex1 = invtmp * (-r3 - 2.0_r8 * r5 * (yminmax(n) - scy)) - if ( (ex1 > xminmax(1)-threshold) .and. (ex1 < xminmax(2)+threshold) ) then - dx = ex1 - scx; dy = yminmax(n) - scy - v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy) - max_val = max(max_val, v1) - min_val = min(min_val, v1) - endif - enddo - endif + ! ! Top/bottom edge, y=const., du/dx=0 ! if (abs(r4) > threshold) then - invtmp = 1.0_r8 / (2.0_r8 * r4)! + spherecentroid(1,i,j) + invtmp = 1.0_r8 / (2.0_r8 * r4) do n = 1,2 ex1 = scx+invtmp * (-r2 - r6 * (yminmax(n) - scy)) if ((ex1 > xminmax(1)-threshold) .and. (ex1 < xminmax(2)+threshold)) then @@ -532,10 +505,10 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,k,nlev,ntrac,irecons,recons,sphe enddo endif ! - ! Top/bottom edge, y=const., du/dx=0 + ! Left/right edge, x=const., du/dy=0 ! if (abs(r5) > threshold) then - invtmp = 1.0_r8 / (2.0_r8 * r5)! + spherecentroid(1,i,j) + invtmp = 1.0_r8 / (2.0_r8 * r5) do n = 1,2 ex1 = scy+invtmp * (-r3 - r6 * (xminmax(n) - scx)) if ((ex1 > yminmax(1)-threshold) .and. (ex1 < yminmax(2)+threshold)) then From 7b690fe3d6985ea9384ae288af491b1d12523e3b Mon Sep 17 00:00:00 2001 From: Jesse Nusbaumer Date: Tue, 25 Nov 2025 09:36:01 -0700 Subject: [PATCH 8/9] Remove dead config code originally used by Eulerian dycore. --- cime_config/config_pes.xml | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/cime_config/config_pes.xml b/cime_config/config_pes.xml index fe5db44ecf..4d6beab0ba 100644 --- a/cime_config/config_pes.xml +++ b/cime_config/config_pes.xml @@ -761,43 +761,6 @@ - - - - none - - 64 - 64 - 64 - 64 - 64 - 64 - 64 - 64 - - - 1 - 1 - 1 - 1 - 1 - 1 - 1 - 1 - - - 0 - 0 - 0 - 0 - 0 - 0 - 0 - 0 - - - - From 85a3126ddb9ae2111bf2c40271b89d34ae6717e7 Mon Sep 17 00:00:00 2001 From: Cheryl Craig Date: Wed, 26 Nov 2025 11:36:30 -0700 Subject: [PATCH 9/9] Update ChangeLog for cam6_4_131 --- doc/ChangeLog | 65 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/doc/ChangeLog b/doc/ChangeLog index 5317e0dba9..897856ed6d 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,13 +1,13 @@ =============================================================== -Tag name: cam6_ -Originator(s): johnmauff, pel -Date: April 28, 2025 -One-line Summary: -Github PR URL: https://github.com/ESCOMP/CAM/pull/ +Tag name: cam6_4_131 +Originator(s): johnmauff, pel, cacraig, nusbaume +Date: Nov 26, 2025 +One-line Summary: Performance improvements for CSLAM +Github PR URL: https://github.com/ESCOMP/CAM/pull/1365 Purpose of changes (include the issue number and title text for each relevant GitHub issue): - Excessive data movement in extend_panel_interpolate (CSLAM): https://github.com/ESCOMP/CAM/issues/1316 + Excessive data movement in extend_panel_interpolate (CSLAM): https://github.com/ESCOMP/CAM/issues/1360 The subroutine extend_panel_interpolate is written such that the compiler will generate more data movement than is necessary. This excessive data movement intensifies a computational load imbalance in the CSLAM advection. While it is impossible to eliminate @@ -22,7 +22,7 @@ List any changes to the defaults for the boundary datasets: N/A Describe any substantial timing or memory changes: N/A -Code reviewed by: +Code reviewed by: nusbaume List all files eliminated: N/A @@ -30,10 +30,57 @@ List all files added and what they do: N/A List all existing files that have been modified, and describe the changes: -.... +M cime_config/config_pes.xml + - remove dead config code originally used by Eulerian dycore + +M src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 +M src/dynamics/se/dycore/fvm_reconstruction_mod.F90 + - mods for SE dycore as described above + +If there were any failures reported from running test_driver.sh on any test +platform, and checkin with these failures has been OK'd by the gatekeeper, +then copy the lines from the td.*.status files for the failed tests to the +appropriate machine below. All failed tests must be justified. + +derecho/intel/aux_cam: + SMS_D_Ln9_P1536x1.ne0CONUSne30x8_ne0CONUSne30x8_mt12.FCHIST.derecho_intel.cam-outfrq9s (Overall: FAIL) details: + - intermittent failure in CTSM code (lnd_set_decomp_and_domain.F90) + + ERC_D_Ln9.ne30pg2_ne30pg2_mt232.QPC7.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + ERC_D_Ln9.ne30pg3_ne30pg3_mt232.F1850C_LTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + ERI_D_Ln18.ne16pg3_ne16pg3_mt232.FHIST_C4.derecho_intel.cam-outfrq3s_eri (Overall: DIFF) details: + ERI_D_Ln18.ne30pg3_ne30pg3_mt232.FHISTC_LTso.derecho_intel.cam-outfrq3s_eri (Overall: DIFF) details: + ERP_D_Ln9.ne30pg3_ne30pg3_mt232.F1850C_MTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + ERP_Ld3.ne16pg3_ne16pg3_mg17.FHISTC_WAt1ma.derecho_intel.cam-reduced_hist1d (Overall: DIFF) details: + ERP_Ld3.ne30pg3_ne30pg3_mt232.FHISTC_MTt4s.derecho_intel.cam-outfrq1d_aoa (Overall: DIFF) details: + ERP_Ln9.ne30pg3_ne30pg3_mg17.FCnudged.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + ERP_Ln9.ne30pg3_ne30pg3_mg17.FHISTC_WAma.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + ERR_Ln9.ne16pg3_ne16pg3_mt232.FHISTC_LTso.derecho_intel.cam-outfrq9s_bwic (Overall: DIFF) details: + ERS_Ln9.ne30pg3_ne30pg3_mg17.FHISTC_WXma.derecho_intel.cam-outfrq9s_ctem (Overall: DIFF) details: + SMS_C2_D_Ln9.ne16pg3_ne16pg3_mg17.FHISTC_WXma.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + SMS_D_Ln9.ne30pg3_ne30pg3_mt232.FHISTC_MTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + SMS_D_Ln9_P1280x1.ne30pg3_ne30pg3_mt232.FHISTC_MTt1s.derecho_intel.cam-outfrq9s_Leung_dust (Overall: DIFF) details: + SMS_Ld1.ne30pg3_ne30pg3_mg17.FC2010climo.derecho_intel.cam-outfrq1d (Overall: DIFF) details: + SMS_Ln9.ne30pg3_ne30pg3_mg17.FW2000climo.derecho_intel.cam-outfrq9s_rrtmgp (Overall: DIFF) details: + - answer differences for CSLAM runs + +derecho/nvhpc/aux_cam: + ERS_Ln9.ne30pg3_ne30pg3_mt232.FHISTC_LTso.derecho_nvhpc.cam-outfrq9s_gpu_default (Overall: FAIL) details: + - timing issue - Jian has determined this is due to changes on derecho with the the last upgrade. He has + reported the issue to CISL. Note cam6_4_128 is the last CAM tag with baselines to use for comparison + but answer changes are expected starting with cam6_4_130 + +izumi/nag/aux_cam: + ERC_D_Ln27.ne3pg3_ne3pg3_mt232.FKESSLER.izumi_nag.cam-outfrq9s (Overall: DIFF) details: + ERC_D_Ln9.ne3pg3_ne3pg3_mt232.FHISTC_LTso.izumi_nag.cam-cosp_rad_diags (Overall: DIFF) details: + SMS_D_Ln3.ne5pg3_ne5pg3_mg37.QPX2000.izumi_nag.cam-outfrq3s (Overall: DIFF) details: + - answer differences for CSLAM runs + + +izumi/gnu/aux_cam: all BFB Summarize any changes to answers: - No answer changes, all b4b + Answer changing, bug not climate changing as reported by pel ===============================================================