diff --git a/cime_config/config_pes.xml b/cime_config/config_pes.xml
index 7b9b5793f2..7c8c2b95b8 100644
--- a/cime_config/config_pes.xml
+++ b/cime_config/config_pes.xml
@@ -572,43 +572,6 @@
-
-
-
- none
-
- 64
- 64
- 64
- 64
- 64
- 64
- 64
- 64
-
-
- 1
- 1
- 1
- 1
- 1
- 1
- 1
- 1
-
-
- 0
- 0
- 0
- 0
- 0
- 0
- 0
- 0
-
-
-
-
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 980f22d746..897856ed6d 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,86 @@
+===============================================================
+
+Tag name: cam6_4_131
+Originator(s): johnmauff, pel, cacraig, nusbaume
+Date: Nov 26, 2025
+One-line Summary: Performance improvements for CSLAM
+Github PR URL: https://github.com/ESCOMP/CAM/pull/1365
+
+Purpose of changes (include the issue number and title text for each relevant GitHub issue):
+ Excessive data movement in extend_panel_interpolate (CSLAM): https://github.com/ESCOMP/CAM/issues/1360
+
+ The subroutine extend_panel_interpolate is written such that the compiler will generate more data movement than is necessary.
+ This excessive data movement intensifies a computational load imbalance in the CSLAM advection. While it is impossible to eliminate
+ the load imbalance that is caused by the special treatment of panels at the corners of the cubed sphere, it is possible to reduce
+ the cost of this subroutine by changing the way that the subroutine is written.
+
+Describe any changes made to build system: N/A
+
+Describe any changes made to the namelist: N/A
+
+List any changes to the defaults for the boundary datasets: N/A
+
+Describe any substantial timing or memory changes: N/A
+
+Code reviewed by: nusbaume
+
+List all files eliminated: N/A
+
+List all files added and what they do: N/A
+
+List all existing files that have been modified, and describe the changes:
+
+M cime_config/config_pes.xml
+ - remove dead config code originally used by Eulerian dycore
+
+M src/dynamics/se/dycore/fvm_consistent_se_cslam.F90
+M src/dynamics/se/dycore/fvm_reconstruction_mod.F90
+ - mods for SE dycore as described above
+
+If there were any failures reported from running test_driver.sh on any test
+platform, and checkin with these failures has been OK'd by the gatekeeper,
+then copy the lines from the td.*.status files for the failed tests to the
+appropriate machine below. All failed tests must be justified.
+
+derecho/intel/aux_cam:
+ SMS_D_Ln9_P1536x1.ne0CONUSne30x8_ne0CONUSne30x8_mt12.FCHIST.derecho_intel.cam-outfrq9s (Overall: FAIL) details:
+ - intermittent failure in CTSM code (lnd_set_decomp_and_domain.F90)
+
+ ERC_D_Ln9.ne30pg2_ne30pg2_mt232.QPC7.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+ ERC_D_Ln9.ne30pg3_ne30pg3_mt232.F1850C_LTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+ ERI_D_Ln18.ne16pg3_ne16pg3_mt232.FHIST_C4.derecho_intel.cam-outfrq3s_eri (Overall: DIFF) details:
+ ERI_D_Ln18.ne30pg3_ne30pg3_mt232.FHISTC_LTso.derecho_intel.cam-outfrq3s_eri (Overall: DIFF) details:
+ ERP_D_Ln9.ne30pg3_ne30pg3_mt232.F1850C_MTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+ ERP_Ld3.ne16pg3_ne16pg3_mg17.FHISTC_WAt1ma.derecho_intel.cam-reduced_hist1d (Overall: DIFF) details:
+ ERP_Ld3.ne30pg3_ne30pg3_mt232.FHISTC_MTt4s.derecho_intel.cam-outfrq1d_aoa (Overall: DIFF) details:
+ ERP_Ln9.ne30pg3_ne30pg3_mg17.FCnudged.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+ ERP_Ln9.ne30pg3_ne30pg3_mg17.FHISTC_WAma.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+ ERR_Ln9.ne16pg3_ne16pg3_mt232.FHISTC_LTso.derecho_intel.cam-outfrq9s_bwic (Overall: DIFF) details:
+ ERS_Ln9.ne30pg3_ne30pg3_mg17.FHISTC_WXma.derecho_intel.cam-outfrq9s_ctem (Overall: DIFF) details:
+ SMS_C2_D_Ln9.ne16pg3_ne16pg3_mg17.FHISTC_WXma.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+ SMS_D_Ln9.ne30pg3_ne30pg3_mt232.FHISTC_MTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+ SMS_D_Ln9_P1280x1.ne30pg3_ne30pg3_mt232.FHISTC_MTt1s.derecho_intel.cam-outfrq9s_Leung_dust (Overall: DIFF) details:
+ SMS_Ld1.ne30pg3_ne30pg3_mg17.FC2010climo.derecho_intel.cam-outfrq1d (Overall: DIFF) details:
+ SMS_Ln9.ne30pg3_ne30pg3_mg17.FW2000climo.derecho_intel.cam-outfrq9s_rrtmgp (Overall: DIFF) details:
+ - answer differences for CSLAM runs
+
+derecho/nvhpc/aux_cam:
+ ERS_Ln9.ne30pg3_ne30pg3_mt232.FHISTC_LTso.derecho_nvhpc.cam-outfrq9s_gpu_default (Overall: FAIL) details:
+ - timing issue - Jian has determined this is due to changes on derecho with the the last upgrade. He has
+ reported the issue to CISL. Note cam6_4_128 is the last CAM tag with baselines to use for comparison
+ but answer changes are expected starting with cam6_4_130
+
+izumi/nag/aux_cam:
+ ERC_D_Ln27.ne3pg3_ne3pg3_mt232.FKESSLER.izumi_nag.cam-outfrq9s (Overall: DIFF) details:
+ ERC_D_Ln9.ne3pg3_ne3pg3_mt232.FHISTC_LTso.izumi_nag.cam-cosp_rad_diags (Overall: DIFF) details:
+ SMS_D_Ln3.ne5pg3_ne5pg3_mg37.QPX2000.izumi_nag.cam-outfrq3s (Overall: DIFF) details:
+ - answer differences for CSLAM runs
+
+
+izumi/gnu/aux_cam: all BFB
+
+Summarize any changes to answers:
+ Answer changing, bug not climate changing as reported by pel
===============================================================
diff --git a/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 b/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90
index d3becc8446..3e56498b38 100644
--- a/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90
+++ b/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90
@@ -1,3 +1,4 @@
+#define FVM_TIMERS .FALSE.
module fvm_consistent_se_cslam
use shr_kind_mod, only: r8=>shr_kind_r8
use dimensions_mod, only: nc, nhe, nlev, ntrac, np, nhr, nhc, ngpc, ns, nht
@@ -107,7 +108,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
endif
kblk = kmax-kmin+1
- !call t_startf('fvm:before_Qnhc')
+ if(FVM_TIMERS) call t_startf('fvm:before_Qnhc')
do ie=nets,nete
do k=kmin,kmax
elem(ie)%sub_elem_mass_flux(:,:,:,k) = dt_fvm*elem(ie)%sub_elem_mass_flux(:,:,:,k)*fvm(ie)%dp_ref_inverse(k)
@@ -120,11 +121,11 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
call ghostpack(ghostbufQnhc,fvm(ie)%c(1-nhc:nc+nhc,1-nhc:nc+nhc,kmin:kmax,q),kblk,kptr,ie)
enddo
end do
- !call t_stopf('fvm:before_Qnhc')
- !call t_startf('fvm:ghost_exchange:Qnhc')
+ if(FVM_TIMERS) call t_stopf('fvm:before_Qnhc')
+ if(FVM_TIMERS) call t_startf('fvm:ghost_exchange:Qnhc')
call ghost_exchange(hybridnew,ghostbufQnhc,location='ghostbufQnhc')
- !call t_stopf('fvm:ghost_exchange:Qnhc')
- !call t_startf('fvm:orthogonal_swept_areas')
+ if(FVM_TIMERS) call t_stopf('fvm:ghost_exchange:Qnhc')
+ if(FVM_TIMERS) call t_startf('fvm:orthogonal_swept_areas')
do ie=nets,nete
do k=kmin,kmax
fvm(ie)%se_flux (1:nc,1:nc,:,k) = elem(ie)%sub_elem_mass_flux(:,:,:,k)
@@ -152,14 +153,14 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
end do
enddo
- !call t_stopf('fvm:orthogonal_swept_areas')
+ if(FVM_TIMERS) call t_stopf('fvm:orthogonal_swept_areas')
do ie=nets,nete
! Intel compiler version 2023.0.0 on derecho had significant slowdown on subroutine interface without
! these pointers.
fcube => fvm(ie)%c(:,:,:,:)
spherecentroid => fvm(ie)%spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe)
do k=kmin,kmax
- !call t_startf('FVM:tracers_reconstruct')
+ if(FVM_TIMERS) call t_startf('FVM:tracers_reconstruct')
call reconstruction(fcube,nlev,k,&
ctracer(:,:,:,:),irecons_tracer,llimiter,ntrac,&
nc,nhe,nhr,nhc,nht,ns,nhr+(nhe-1),&
@@ -170,10 +171,10 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
fvm(ie)%rot_matrix,fvm(ie)%centroid_stretch,&
fvm(ie)%vertex_recons_weights,fvm(ie)%vtx_cart,&
irecons_tracer_lev(k))
- !call t_stopf('FVM:tracers_reconstruct')
- !call t_startf('fvm:swept_flux')
+ if(FVM_TIMERS) call t_stopf('FVM:tracers_reconstruct')
+ if(FVM_TIMERS) call t_startf('fvm:swept_flux')
call swept_flux(elem(ie),fvm(ie),k,ctracer,irecons_tracer_lev(k),gsweights,gspts)
- !call t_stopf('fvm:swept_flux')
+ if(FVM_TIMERS) call t_stopf('fvm:swept_flux')
end do
end do
!
@@ -193,7 +194,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
!
!
if (large_Courant_incr) then
- !call t_startf('fvm:fill_halo_fvm:large_Courant')
+ if(FVM_TIMERS) call t_startf('fvm:fill_halo_fvm:large_Courant')
!if (kmin_jetkmax) then
! call endrun('ERROR: kmax_jet must be .le. kmax passed to run_consistent_se_cslam')
!end if
@@ -203,8 +204,8 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
kmax_jet_local = min(kmax_jet,kmax)
klev = kmax_jet-kmin_jet+1
call fill_halo_fvm(ghostbufQ1,elem,fvm,hybridnew,nets,nete,1,kmin_jet_local,kmax_jet_local,klev,active=ActiveJetThread)
- !call t_stopf('fvm:fill_halo_fvm:large_Courant')
- !call t_startf('fvm:large_Courant_number_increment')
+ if(FVM_TIMERS) call t_stopf('fvm:fill_halo_fvm:large_Courant')
+ if(FVM_TIMERS) call t_startf('fvm:large_Courant_number_increment')
if(ActiveJetThread) then
do k=kmin_jet_local,kmax_jet_local !1,nlev
do ie=nets,nete
@@ -212,10 +213,10 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
end do
end do
endif
- !call t_stopf('fvm:large_Courant_number_increment')
+ if(FVM_TIMERS) call t_stopf('fvm:large_Courant_number_increment')
end if
- !call t_startf('fvm:end_of_reconstruct_subroutine')
+ if(FVM_TIMERS) call t_startf('fvm:end_of_reconstruct_subroutine')
do k=kmin,kmax
!
! convert to mixing ratio
@@ -251,7 +252,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
elem(ie)%sub_elem_mass_flux(:,:,:,k)=0
end do
end do
- !call t_stopf('fvm:end_of_reconstruct_subroutine')
+ if(FVM_TIMERS) call t_stopf('fvm:end_of_reconstruct_subroutine')
!$OMP END PARALLEL
call omp_set_nested(.false.)
end subroutine run_consistent_se_cslam
@@ -281,7 +282,7 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
REAL(KIND=r8), dimension(2,8) :: x_start, dgam_vec
REAL(KIND=r8) :: gamma_max, displ_first_guess
- REAL(KIND=r8) :: flux,flux_tracer(ntrac)
+ REAL(KIND=r8) :: flux,flux_tracer(ntrac),w
REAL(KIND=r8), dimension(num_area) :: dp_area
@@ -306,7 +307,6 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
!
! prepare for air/tracer update
!
-! dp = fvm%dp_fvm(1-nhe:nc+nhe,1-nhe:nc+nhe,ilev)
dp = fvm%dp_fvm(1-nhc:nc+nhc,1-nhc:nc+nhc,ilev)
fvm%dp_fvm(1:nc,1:nc,ilev) = fvm%dp_fvm(1:nc,1:nc,ilev)*fvm%area_sphere
do itr=1,ntrac
@@ -538,14 +538,14 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
!
! iterate to get flux area
!
- !call t_startf('fvm:swept_area:get_gamma')
+ if(FVM_TIMERS) call t_startf('fvm:swept_area:get_gamma')
do iarea=1,num_area
dp_area(iarea) = dp(idx(1,iarea,i,j,iside),idx(2,iarea,i,j,iside))
end do
call get_flux_segments_area_iterate(x,x_static,dx_static,dx,x_start,dgam_vec,num_seg,num_seg_static,&
num_seg_max,num_area,dp_area,flowcase,gamma,mass_flux_se(i,j,iside),0.0_r8,gamma_max, &
gsweights,gspts,ilev)
- !call t_stopf('fvm:swept_area:get_gamma')
+ if(FVM_TIMERS) call t_stopf('fvm:swept_area:get_gamma')
!
! pack segments for high-order weights computation
!
@@ -560,10 +560,10 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
!
! compute higher-order weights
!
- !call t_startf('fvm:swept_area:get_high_order_w')
+ if(FVM_TIMERS) call t_startf('fvm:swept_area:get_high_order_w')
call get_high_order_weights_over_areas(x,dx,num_seg,num_seg_max,num_area,weights,ngpc,&
gsweights, gspts,irecons_tracer)
- !call t_stopf('fvm:swept_area:get_high_order_w')
+ if(FVM_TIMERS) call t_stopf('fvm:swept_area:get_high_order_w')
!
!**************************************************
!
@@ -571,16 +571,17 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
!
!**************************************************
!
- !call t_startf('fvm:swept_area:remap')
+ if(FVM_TIMERS) call t_startf('fvm:swept_area:remap')
flux=0.0_r8; flux_tracer=0.0_r8
do iarea=1,num_area
if (num_seg(iarea)>0) then
ii=idx(1,iarea,i,j,iside); jj=idx(2,iarea,i,j,iside)
flux=flux+weights(1,iarea)*dp(ii,jj)
- do itr=1,ntrac
- do iw=1,irecons_tracer_actual
- flux_tracer(itr) = flux_tracer(itr)+weights(iw,iarea)*ctracer(iw,ii,jj,itr)
- end do
+ do iw=1,irecons_tracer_actual
+ w = weights(iw,iarea)
+ do itr=1,ntrac
+ flux_tracer(itr) = flux_tracer(itr)+w*ctracer(iw,ii,jj,itr)
+ end do
end do
end if
end do
@@ -614,7 +615,7 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
fvm%dp_fvm(i-1,j,ilev ) = fvm%dp_fvm(i-1,j,ilev )+flux
fvm% c(i-1,j,ilev,1:ntrac) = fvm% c(i-1,j,ilev,1:ntrac)+flux_tracer(1:ntrac)
end if
- !call t_stopf('fvm:swept_area:remap')
+ if(FVM_TIMERS) call t_stopf('fvm:swept_area:remap')
end if
end do
end do
diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
index b4708dfd3b..8b425042be 100644
--- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
+++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
@@ -79,7 +79,7 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,&
real (kind=r8), dimension(1-nht:nc+nht,1-nht:nc+nht,3) :: f
- integer :: i,j,in,h,itr
+ integer :: i,j,in,h,itr,k
integer, dimension(2,3) :: jx,jy
if (present(irecons_actual_in)) then
@@ -87,7 +87,6 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,&
else
irecons_actual = irecons
end if
-
jx(1,1)=jx_min(1); jx(2,1)=jx_max(1)-1
jx(1,2)=jx_min(2); jx(2,2)=jx_max(2)-1
@@ -119,42 +118,12 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,&
end do
end if
if(FVM_TIMERS) call t_stopf('FVM:reconstruction:part#1')
- if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#2')
- !
- ! fill in non-existent (in physical space) corner values to simplify
- ! logic in limiter code (min/max operation)
- !
- do itr=1,ntrac_in
- if (llimiter(itr)) then
- if (cubeboundary>4) then
- select case(cubeboundary)
- case (nwest)
- do h=1,nhe+1
- fcube(0,nc+h ,k_in,itr) = fcube(1-h,nc ,k_in,itr)
- fcube(1-h,nc+1,k_in,itr) = fcube(1 ,nc+h,k_in,itr)
- end do
- case (swest)
- do h=1,nhe+1
- fcube(1-h,0,k_in,itr) = fcube(1,1-h,k_in,itr)
- fcube(0,1-h,k_in,itr) = fcube(1-h,1,k_in,itr)
- end do
- case (seast)
- do h=1,nhe+1
- fcube(nc+h,0 ,k_in,itr) = fcube(nc,1-h,k_in,itr)
- fcube(nc+1,1-h,k_in,itr) = fcube(nc+h,1,k_in,itr)
- end do
- case (neast)
- do h=1,nhe+1
- fcube(nc+h,nc+1,k_in,itr) = fcube(nc,nc+h,k_in,itr)
- fcube(nc+1,nc+h,k_in,itr) = fcube(nc+h,nc,k_in,itr)
- end do
- end select
- end if
- call slope_limiter(nhe,nc,nhc,fcube(:,:,k_in,itr),jx,jy,irecons,recons(:,:,:,itr),&
- spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe),&
- recons_metrics,vertex_recons_weights,vtx_cart,irecons_actual)
- end if
- end do
+ if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#2')
+
+ call slope_limiter(nhe,nc,nhc,fcube,jx,jy,k_in,nlev_in,ntrac_in,irecons,recons,&
+ spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe),&
+ recons_metrics,vertex_recons_weights,vtx_cart,irecons_actual,llimiter,&
+ cubeboundary)
if(FVM_TIMERS) call t_stopf('FVM:reconstruction:part#2')
end if
if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#3')
@@ -185,8 +154,6 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,&
end do
case(6)
do itr=1,ntrac_in
-! do j=1-nhe,nc+nhe
-! do i=1-nhe,nc+nhe
do in=1,3
do j=jy(1,in),jy(2,in)
do i=jx(1,in),jx(2,in)
@@ -331,31 +298,68 @@ subroutine get_gradients(f,jx,jy,irecons,gradient,rot_matrix,centroid_stretch,nc
end subroutine get_gradients
- subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,recons_metrics,&
- vertex_recons_weights,vtx_cart,irecons_actual)
+ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,k,nlev,ntrac,irecons,recons,spherecentroid,recons_metrics,&
+ vertex_recons_weights,vtx_cart,irecons_actual,llimiter,cubeboundary)
implicit none
- integer , intent(in) :: irecons,nhe,nc,nhc,irecons_actual
- real (kind=r8), dimension(1-nhc:, 1-nhc:) , intent(in) :: fcube
- real (kind=r8), dimension(irecons,1-nhe:nc+nhe,1-nhe:nc+nhe) , intent(inout):: recons
+ integer , intent(in) :: irecons_actual,k,nlev,ntrac
+ integer , intent(in) :: irecons,nhe,nc,nhc
+ real (kind=r8), dimension(1-nhc:nc+nhc,1-nhc:nc+nhc,nlev,ntrac) , intent(inout) :: fcube
+ real (kind=r8), dimension(irecons,1-nhe:nc+nhe,1-nhe:nc+nhe,ntrac), intent(inout):: recons
integer, dimension(2,3) , intent(in) :: jx,jy
real (kind=r8), dimension(irecons-1,1-nhe:nc+nhe,1-nhe:nc+nhe) , intent(in) :: spherecentroid
real (kind=r8), dimension(3,1-nhe:nc+nhe,1-nhe:nc+nhe) , intent(in) :: recons_metrics
real (kind=r8), dimension(4,1:irecons-1,1-nhe:nc+nhe,1-nhe:nc+nhe), intent(in) :: vertex_recons_weights
real (kind=r8), dimension(4,2,1-nhc:nc+nhc,1-nhc:nc+nhc) , intent(in) :: vtx_cart
+ logical, dimension(ntrac) , intent(in) :: llimiter
+ integer , intent(in) :: cubeboundary
real (kind=r8):: minval_patch,maxval_patch
real (kind=r8):: phi, min_val, max_val,disc
+ real (kind=r8):: v1,v2,v3,v4,vx1,vx2,vx3,vx4,vy1,vy2,vy3,vy4,r2,r3,r4,r5,r6,scx,scy,dx,dy,ex1,ex2,f0,val
+ real (kind=r8):: m1,m2,m3
real (kind=r8):: min_phi
real (kind=r8):: extrema(2), xminmax(2),yminmax(2),extrema_value(13)
real(kind=r8) :: invtmp ! temporary to pre-compute inverses
- integer :: itmp1,itmp2,i,j,in,vertex,n
+ integer :: itmp1,itmp2,i,j,in,vertex,n,itr,h
-! real (kind=r8), dimension(-1:5) :: diff_value
real (kind=r8), dimension(-1:1) :: minval_array, maxval_array
real (kind=r8), parameter :: threshold = 1.0E-40_r8
- character(len=128) :: errormsg
+ character(len=128) :: errormsg
+ integer :: im1,jm1,ip1,jp1
+ !
+ ! fill in non-existent (in physical space) corner values to simplify
+ ! logic in limiter code (min/max operation)
+ !
+ do itr=1,ntrac
+ if (.not. llimiter(itr)) cycle
+ if (cubeboundary>4) then
+ select case(cubeboundary)
+ case (nwest)
+ do h=1,nhe+1
+ fcube(0,nc+h ,k,itr) = fcube(1-h,nc ,k,itr)
+ fcube(1-h,nc+1,k,itr) = fcube(1 ,nc+h,k,itr)
+ end do
+ case (swest)
+ do h=1,nhe+1
+ fcube(1-h,0,k,itr) = fcube(1,1-h,k,itr)
+ fcube(0,1-h,k,itr) = fcube(1-h,1,k,itr)
+ end do
+ case (seast)
+ do h=1,nhe+1
+ fcube(nc+h,0 ,k,itr) = fcube(nc,1-h,k,itr)
+ fcube(nc+1,1-h,k,itr) = fcube(nc+h,1,k,itr)
+ end do
+ case (neast)
+ do h=1,nhe+1
+ fcube(nc+h,nc+1,k,itr) = fcube(nc,nc+h,k,itr)
+ fcube(nc+1,nc+h,k,itr) = fcube(nc+h,nc,k,itr)
+ end do
+ end select
+ end if
+ end do
+
select case (irecons_actual)
!
! PLM limiter
@@ -364,52 +368,51 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,re
do in=1,3
do j=jy(1,in),jy(2,in)
do i=jx(1,in),jx(2,in)
- !rck combined min/max and unrolled inner loop
- !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1))
- !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1))
- !DIR$ SIMD
- do itmp2=-1,+1
- itmp1 = j+itmp2
- minval_array(itmp2) = min(fcube(i-1,itmp1),fcube(i,itmp1),fcube(i+1,itmp1))
- maxval_array(itmp2) = max(fcube(i-1,itmp1),fcube(i,itmp1),fcube(i+1,itmp1))
- enddo
- minval_patch = min(minval_array(-1),minval_array(0),minval_array(1))
- maxval_patch = max(maxval_array(-1),maxval_array(0),maxval_array(1))
-
- min_phi=1.0_r8
-
- !
- ! coordinate bounds (could be pre-computed!)
- !
- xminmax(1) = min(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j))
- xminmax(2) = max(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j))
- yminmax(1) = min(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j))
- yminmax(2) = max(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j))
-
- !rck restructured loop
- !DIR$ SIMD
- do vertex=1,4
- call recons_val_cart_plm(fcube(i,j), vtx_cart(vertex,1,i,j), vtx_cart(vertex,2,i,j), spherecentroid(:,i,j), &
- recons(1:3,i,j), extrema_value(vertex))
+ do itr = 1, ntrac
+ if (.not. llimiter(itr)) cycle
+ !rck combined min/max and unrolled inner loop
+ !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1))
+ !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1))
+ !DIR$ SIMD
+ do itmp2=-1,+1
+ itmp1 = j+itmp2
+ minval_array(itmp2) = min(fcube(i-1,itmp1,k,itr),fcube(i,itmp1,k,itr),fcube(i+1,itmp1,k,itr))
+ maxval_array(itmp2) = max(fcube(i-1,itmp1,k,itr),fcube(i,itmp1,k,itr),fcube(i+1,itmp1,k,itr))
+ enddo
+ minval_patch = min(minval_array(-1),minval_array(0),minval_array(1))
+ maxval_patch = max(maxval_array(-1),maxval_array(0),maxval_array(1))
+ min_phi=1.0_r8
+ !
+ ! coordinate bounds (could be pre-computed!)
+ !
+ xminmax(1) = min(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j))
+ xminmax(2) = max(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j))
+ yminmax(1) = min(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j))
+ yminmax(2) = max(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j))
+ !rck restructured loop
+ !DIR$ SIMD
+ do vertex=1,4
+ call recons_val_cart_plm(fcube(i,j,k,itr), vtx_cart(vertex,1,i,j), vtx_cart(vertex,2,i,j), spherecentroid(:,i,j), &
+ recons(1:3,i,j,itr), extrema_value(vertex))
+ end do
+ max_val = MAXVAL(extrema_value(1:4))
+ min_val = MINVAL(extrema_value(1:4))
+ if (max_val>maxval_patch.and.abs(max_val-fcube(i,j,k,itr))>threshold) then
+ phi = (maxval_patch-fcube(i,j,k,itr))/(max_val-fcube(i,j,k,itr))
+ if (phithreshold) then
+ phi = (minval_patch-fcube(i,j,k,itr))/(min_val-fcube(i,j,k,itr))
+ if (phimaxval_patch.and.abs(max_val-fcube(i,j))>threshold) then
- phi = (maxval_patch-fcube(i,j))/(max_val-fcube(i,j))
- if (phithreshold) then
- phi = (minval_patch-fcube(i,j))/(min_val-fcube(i,j))
- if (phi threshold) then
- extrema(1) = recons(6,i,j) * recons(3,i,j) - 2.0_r8 * recons(5,i,j) * recons(2,i,j)
- extrema(2) = recons(6,i,j) * recons(2,i,j) - 2.0_r8 * recons(4,i,j) * recons(3,i,j)
+ vx1 = vtx_cart(1,1,i,j); vy1 = vtx_cart(1,2,i,j)
+ vx2 = vtx_cart(2,1,i,j); vy2 = vtx_cart(2,2,i,j)
+ vx3 = vtx_cart(3,1,i,j); vy3 = vtx_cart(3,2,i,j)
+ vx4 = vtx_cart(4,1,i,j); vy4 = vtx_cart(4,2,i,j)
+ xminmax(1) = min(vx1,vx2,vx3,vx4)
+ xminmax(2) = max(vx1,vx2,vx3,vx4)
+ yminmax(1) = min(vy1,vy2,vy3,vy4)
+ yminmax(2) = max(vy1,vy2,vy3,vy4)
+ scx = spherecentroid(1,i,j)
+ scy = spherecentroid(2,i,j)
+ m1 = recons_metrics(1,i,j)
+ m2 = recons_metrics(2,i,j)
+ m3 = recons_metrics(3,i,j)
+ im1 = i-1; ip1 = i+1
+ jm1 = j-1; jp1 = j+1
+ do itr = 1, ntrac
+ if (.not. llimiter(itr)) cycle
+ !rck combined min/max and unrolled inner loop
+ !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1))
+ !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1))
+ minval_patch = min( min( fcube(im1,jm1,k,itr), fcube(i ,jm1,k,itr), fcube(ip1,jm1,k,itr) ), &
+ min( fcube(im1,j ,k,itr), fcube(i ,j ,k,itr), fcube(ip1,j ,k,itr) ), &
+ min( fcube(im1,jp1,k,itr), fcube(i ,jp1,k,itr), fcube(ip1,jp1,k,itr) ) )
+ maxval_patch = max( max( fcube(im1,jm1,k,itr), fcube(i ,jm1,k,itr), fcube(ip1,jm1,k,itr) ), &
+ max( fcube(im1,j ,k,itr), fcube(i ,j ,k,itr), fcube(ip1,j ,k,itr) ), &
+ max( fcube(im1,jp1,k,itr), fcube(i ,jp1,k,itr), fcube(ip1,jp1,k,itr) ) )
+ min_phi=1.0_r8
- disc=1.0_r8/disc
- extrema(1) = extrema(1) * disc + spherecentroid(1,i,j)
- extrema(2) = extrema(2) * disc + spherecentroid(2,i,j)
- if ( (extrema(1) - xminmax(1) > -threshold) .and. & !xmin
- (extrema(1) - xminmax(2) < threshold) .and. & !xmax
- (extrema(2) - yminmax(1) > -threshold) .and. & !ymin
- (extrema(2) - yminmax(2) < threshold)) then !ymax
- call recons_val_cart(fcube(i,j), extrema(1), extrema(2), spherecentroid(:,i,j), &
- recons_metrics(:,i,j), recons(:,i,j), extrema_value(5))
- endif
- endif
- !
- ! Check all potential minimizer points along element boundaries
- !
- if (abs(recons(6,i,j)) > threshold) then
- invtmp = 1.0_r8 / (recons(6,i,j) + spherecentroid(2,i,j))
- do n=1,2
- ! Left edge, intercept with du/dx = 0
- extrema(2) = invtmp * (-recons(2,i,j) - 2.0_r8 * recons(4,i,j) * (xminmax(n) - spherecentroid(1,i,j)))
- if ((extrema(2) > yminmax(1)-threshold) .and. (extrema(2) < yminmax(2)+threshold)) then
- call recons_val_cart(fcube(i,j), xminmax(n), extrema(2), spherecentroid(:,i,j), &
- recons_metrics(:,i,j), recons(:,i,j), extrema_value(5+n))
- endif
- enddo
- ! Top/bottom edge, intercept with du/dy = 0
- invtmp = 1.0_r8 / recons(6,i,j) + spherecentroid(1,i,j)
- do n = 1,2
- extrema(1) = invtmp * (-recons(3,i,j) - 2.0_r8 * recons(5,i,j) * (yminmax(n) - spherecentroid(2,i,j)))
- if ((extrema(1) > xminmax(1)-threshold) .and. (extrema(1) < xminmax(2)+threshold)) then
- call recons_val_cart(fcube(i,j), extrema(1), yminmax(n),spherecentroid(:,i,j), &
- recons_metrics(:,i,j), recons(:,i,j), extrema_value(7+n))
- endif
+ f0 = fcube(i,j,k,itr)
+ min_val = f0; max_val = f0!initialize min/max
+ !
+ ! compute min/max value at cell corners
+ !DIR$ SIMD
+ do vertex=1,4
+ val = f0
+ do itmp1=1,irecons-1
+ val = val + recons(itmp1+1,i,j,itr)*vertex_recons_weights(vertex,itmp1,i,j)
+ enddo
+ min_val = min(min_val,val)
+ max_val = max(max_val,val)
enddo
- endif
-
- ! Top/bottom edge, y=const., du/dx=0
- if (abs(recons(4,i,j)) > threshold) then
- invtmp = 1.0_r8 / (2.0_r8 * recons(4,i,j))! + spherecentroid(1,i,j)
- do n = 1,2
- extrema(1) = spherecentroid(1,i,j)+&
- invtmp * (-recons(2,i,j) - recons(6,i,j) * (yminmax(n) - spherecentroid(2,i,j)))
-
- if ((extrema(1) > xminmax(1)-threshold) .and. (extrema(1) < xminmax(2)+threshold)) then
- call recons_val_cart(fcube(i,j), extrema(1), yminmax(n), spherecentroid(:,i,j),&
- recons_metrics(:,i,j),recons(:,i,j), extrema_value(9+n))
- endif
- enddo
- endif
- ! Left/right edge, x=const., du/dy=0
- if (abs(recons(5,i,j)) > threshold) then
- invtmp = 1.0_r8 / (2.0_r8 * recons(5,i,j))
- do n = 1,2
- extrema(2) = spherecentroid(2,i,j)+&
- invtmp * (-recons(3,i,j) - recons(6,i,j) * (xminmax(n) - spherecentroid(1,i,j)))
-
- if ((extrema(2)>yminmax(1)-threshold) .and. (extrema(2) < yminmax(2)+threshold)) then
- call recons_val_cart(fcube(i,j), xminmax(n), extrema(2), spherecentroid(:,i,j), &
- recons_metrics(:,i,j), recons(:,i,j), extrema_value(11+n))
- endif
- enddo
- endif
- !rck - combined min/max calculation and unrolled
- ! max_val = MAXVAL(extrema_value)
- ! min_val = MINVAL(extrema_value)
- max_val = extrema_value(13)
- min_val = extrema_value(13)
- do itmp1 = 1,12,4
- max_val = max(max_val, extrema_value(itmp1),extrema_value(itmp1+1),extrema_value(itmp1+2),extrema_value(itmp1+3))
- min_val = min(min_val, extrema_value(itmp1),extrema_value(itmp1+1),extrema_value(itmp1+2),extrema_value(itmp1+3))
- enddo
- !rck
+ r2 = recons(2,i,j,itr); r3 = recons(3,i,j,itr)
+ r4 = recons(4,i,j,itr); r5 = recons(5,i,j,itr); r6 = recons(6,i,j,itr)
+ ! Check if the quadratic is minimized within the element
+ ! Extrema in the interior of the element (there might be just one candidate)
+ ! DO NOT NEED ABS here, if disc<0 we have a saddle point (no maximum or minimum)
+ disc = 4.0_r8 * r4 *r5 - r6*r6
+ if (abs(disc) > threshold) then
+ ex1 = (r6*r3 - 2.0_r8*r5*r2)
+ ex2 = (r6*r2 - 2.0_r8*r4*r3)
+ disc = 1 /disc
+ ex1 = ex1*disc+scx
+ ex2 = ex2*disc+scy
+ if ( ex1 > xminmax(1)-threshold .and. ex1 < xminmax(2)+threshold .and. &
+ ex2 > yminmax(1)-threshold .and. ex2 < yminmax(2)+threshold ) then
+ dx = ex1 - scx; dy = ex2 - scy
+ v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy)
+ max_val = max(max_val, v1)
+ min_val = min(min_val, v1)
+ end if
+ endif
+ !
+ ! Check all potential minimizer points along element boundaries
+ !
- if (max_val>maxval_patch.and.abs(max_val-fcube(i,j))>threshold) then
- phi = (maxval_patch-fcube(i,j))/(max_val-fcube(i,j))
- if (phithreshold) then
- phi = (minval_patch-fcube(i,j))/(min_val-fcube(i,j))
- if (phi threshold) then
+ invtmp = 1.0_r8 / (2.0_r8 * r4)
+ do n = 1,2
+ ex1 = scx+invtmp * (-r2 - r6 * (yminmax(n) - scy))
+ if ((ex1 > xminmax(1)-threshold) .and. (ex1 < xminmax(2)+threshold)) then
+ dx = ex1 - scx; dy = yminmax(n) - scy
+ v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy)
+ max_val = max(max_val, v1)
+ min_val = min(min_val, v1)
+ endif
+ enddo
+ endif
+ !
+ ! Left/right edge, x=const., du/dy=0
+ !
+ if (abs(r5) > threshold) then
+ invtmp = 1.0_r8 / (2.0_r8 * r5)
+ do n = 1,2
+ ex1 = scy+invtmp * (-r3 - r6 * (xminmax(n) - scx))
+ if ((ex1 > yminmax(1)-threshold) .and. (ex1 < yminmax(2)+threshold)) then
+ dx = xminmax(n) - scx; dy = ex1 - scy
+ v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy)
+ max_val = max(max_val, v1)
+ min_val = min(min_val, v1)
+ endif
+ enddo
+ endif
+ !
+ if (max_val>maxval_patch.and.abs(max_val-fcube(i,j,k,itr))>threshold) then
+ phi = (maxval_patch-fcube(i,j,k,itr))/(max_val-fcube(i,j,k,itr))
+ if (phithreshold) then
+ phi = (minval_patch-fcube(i,j,k,itr))/(min_val-fcube(i,j,k,itr))
+ if (phi