diff --git a/cime_config/config_pes.xml b/cime_config/config_pes.xml index 7b9b5793f2..7c8c2b95b8 100644 --- a/cime_config/config_pes.xml +++ b/cime_config/config_pes.xml @@ -572,43 +572,6 @@ - - - - none - - 64 - 64 - 64 - 64 - 64 - 64 - 64 - 64 - - - 1 - 1 - 1 - 1 - 1 - 1 - 1 - 1 - - - 0 - 0 - 0 - 0 - 0 - 0 - 0 - 0 - - - - diff --git a/doc/ChangeLog b/doc/ChangeLog index 980f22d746..897856ed6d 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,3 +1,86 @@ +=============================================================== + +Tag name: cam6_4_131 +Originator(s): johnmauff, pel, cacraig, nusbaume +Date: Nov 26, 2025 +One-line Summary: Performance improvements for CSLAM +Github PR URL: https://github.com/ESCOMP/CAM/pull/1365 + +Purpose of changes (include the issue number and title text for each relevant GitHub issue): + Excessive data movement in extend_panel_interpolate (CSLAM): https://github.com/ESCOMP/CAM/issues/1360 + + The subroutine extend_panel_interpolate is written such that the compiler will generate more data movement than is necessary. + This excessive data movement intensifies a computational load imbalance in the CSLAM advection. While it is impossible to eliminate + the load imbalance that is caused by the special treatment of panels at the corners of the cubed sphere, it is possible to reduce + the cost of this subroutine by changing the way that the subroutine is written. + +Describe any changes made to build system: N/A + +Describe any changes made to the namelist: N/A + +List any changes to the defaults for the boundary datasets: N/A + +Describe any substantial timing or memory changes: N/A + +Code reviewed by: nusbaume + +List all files eliminated: N/A + +List all files added and what they do: N/A + +List all existing files that have been modified, and describe the changes: + +M cime_config/config_pes.xml + - remove dead config code originally used by Eulerian dycore + +M src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 +M src/dynamics/se/dycore/fvm_reconstruction_mod.F90 + - mods for SE dycore as described above + +If there were any failures reported from running test_driver.sh on any test +platform, and checkin with these failures has been OK'd by the gatekeeper, +then copy the lines from the td.*.status files for the failed tests to the +appropriate machine below. All failed tests must be justified. + +derecho/intel/aux_cam: + SMS_D_Ln9_P1536x1.ne0CONUSne30x8_ne0CONUSne30x8_mt12.FCHIST.derecho_intel.cam-outfrq9s (Overall: FAIL) details: + - intermittent failure in CTSM code (lnd_set_decomp_and_domain.F90) + + ERC_D_Ln9.ne30pg2_ne30pg2_mt232.QPC7.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + ERC_D_Ln9.ne30pg3_ne30pg3_mt232.F1850C_LTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + ERI_D_Ln18.ne16pg3_ne16pg3_mt232.FHIST_C4.derecho_intel.cam-outfrq3s_eri (Overall: DIFF) details: + ERI_D_Ln18.ne30pg3_ne30pg3_mt232.FHISTC_LTso.derecho_intel.cam-outfrq3s_eri (Overall: DIFF) details: + ERP_D_Ln9.ne30pg3_ne30pg3_mt232.F1850C_MTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + ERP_Ld3.ne16pg3_ne16pg3_mg17.FHISTC_WAt1ma.derecho_intel.cam-reduced_hist1d (Overall: DIFF) details: + ERP_Ld3.ne30pg3_ne30pg3_mt232.FHISTC_MTt4s.derecho_intel.cam-outfrq1d_aoa (Overall: DIFF) details: + ERP_Ln9.ne30pg3_ne30pg3_mg17.FCnudged.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + ERP_Ln9.ne30pg3_ne30pg3_mg17.FHISTC_WAma.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + ERR_Ln9.ne16pg3_ne16pg3_mt232.FHISTC_LTso.derecho_intel.cam-outfrq9s_bwic (Overall: DIFF) details: + ERS_Ln9.ne30pg3_ne30pg3_mg17.FHISTC_WXma.derecho_intel.cam-outfrq9s_ctem (Overall: DIFF) details: + SMS_C2_D_Ln9.ne16pg3_ne16pg3_mg17.FHISTC_WXma.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + SMS_D_Ln9.ne30pg3_ne30pg3_mt232.FHISTC_MTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details: + SMS_D_Ln9_P1280x1.ne30pg3_ne30pg3_mt232.FHISTC_MTt1s.derecho_intel.cam-outfrq9s_Leung_dust (Overall: DIFF) details: + SMS_Ld1.ne30pg3_ne30pg3_mg17.FC2010climo.derecho_intel.cam-outfrq1d (Overall: DIFF) details: + SMS_Ln9.ne30pg3_ne30pg3_mg17.FW2000climo.derecho_intel.cam-outfrq9s_rrtmgp (Overall: DIFF) details: + - answer differences for CSLAM runs + +derecho/nvhpc/aux_cam: + ERS_Ln9.ne30pg3_ne30pg3_mt232.FHISTC_LTso.derecho_nvhpc.cam-outfrq9s_gpu_default (Overall: FAIL) details: + - timing issue - Jian has determined this is due to changes on derecho with the the last upgrade. He has + reported the issue to CISL. Note cam6_4_128 is the last CAM tag with baselines to use for comparison + but answer changes are expected starting with cam6_4_130 + +izumi/nag/aux_cam: + ERC_D_Ln27.ne3pg3_ne3pg3_mt232.FKESSLER.izumi_nag.cam-outfrq9s (Overall: DIFF) details: + ERC_D_Ln9.ne3pg3_ne3pg3_mt232.FHISTC_LTso.izumi_nag.cam-cosp_rad_diags (Overall: DIFF) details: + SMS_D_Ln3.ne5pg3_ne5pg3_mg37.QPX2000.izumi_nag.cam-outfrq3s (Overall: DIFF) details: + - answer differences for CSLAM runs + + +izumi/gnu/aux_cam: all BFB + +Summarize any changes to answers: + Answer changing, bug not climate changing as reported by pel =============================================================== diff --git a/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 b/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 index d3becc8446..3e56498b38 100644 --- a/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 +++ b/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 @@ -1,3 +1,4 @@ +#define FVM_TIMERS .FALSE. module fvm_consistent_se_cslam use shr_kind_mod, only: r8=>shr_kind_r8 use dimensions_mod, only: nc, nhe, nlev, ntrac, np, nhr, nhc, ngpc, ns, nht @@ -107,7 +108,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& endif kblk = kmax-kmin+1 - !call t_startf('fvm:before_Qnhc') + if(FVM_TIMERS) call t_startf('fvm:before_Qnhc') do ie=nets,nete do k=kmin,kmax elem(ie)%sub_elem_mass_flux(:,:,:,k) = dt_fvm*elem(ie)%sub_elem_mass_flux(:,:,:,k)*fvm(ie)%dp_ref_inverse(k) @@ -120,11 +121,11 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& call ghostpack(ghostbufQnhc,fvm(ie)%c(1-nhc:nc+nhc,1-nhc:nc+nhc,kmin:kmax,q),kblk,kptr,ie) enddo end do - !call t_stopf('fvm:before_Qnhc') - !call t_startf('fvm:ghost_exchange:Qnhc') + if(FVM_TIMERS) call t_stopf('fvm:before_Qnhc') + if(FVM_TIMERS) call t_startf('fvm:ghost_exchange:Qnhc') call ghost_exchange(hybridnew,ghostbufQnhc,location='ghostbufQnhc') - !call t_stopf('fvm:ghost_exchange:Qnhc') - !call t_startf('fvm:orthogonal_swept_areas') + if(FVM_TIMERS) call t_stopf('fvm:ghost_exchange:Qnhc') + if(FVM_TIMERS) call t_startf('fvm:orthogonal_swept_areas') do ie=nets,nete do k=kmin,kmax fvm(ie)%se_flux (1:nc,1:nc,:,k) = elem(ie)%sub_elem_mass_flux(:,:,:,k) @@ -152,14 +153,14 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& end do enddo - !call t_stopf('fvm:orthogonal_swept_areas') + if(FVM_TIMERS) call t_stopf('fvm:orthogonal_swept_areas') do ie=nets,nete ! Intel compiler version 2023.0.0 on derecho had significant slowdown on subroutine interface without ! these pointers. fcube => fvm(ie)%c(:,:,:,:) spherecentroid => fvm(ie)%spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe) do k=kmin,kmax - !call t_startf('FVM:tracers_reconstruct') + if(FVM_TIMERS) call t_startf('FVM:tracers_reconstruct') call reconstruction(fcube,nlev,k,& ctracer(:,:,:,:),irecons_tracer,llimiter,ntrac,& nc,nhe,nhr,nhc,nht,ns,nhr+(nhe-1),& @@ -170,10 +171,10 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& fvm(ie)%rot_matrix,fvm(ie)%centroid_stretch,& fvm(ie)%vertex_recons_weights,fvm(ie)%vtx_cart,& irecons_tracer_lev(k)) - !call t_stopf('FVM:tracers_reconstruct') - !call t_startf('fvm:swept_flux') + if(FVM_TIMERS) call t_stopf('FVM:tracers_reconstruct') + if(FVM_TIMERS) call t_startf('fvm:swept_flux') call swept_flux(elem(ie),fvm(ie),k,ctracer,irecons_tracer_lev(k),gsweights,gspts) - !call t_stopf('fvm:swept_flux') + if(FVM_TIMERS) call t_stopf('fvm:swept_flux') end do end do ! @@ -193,7 +194,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& ! ! if (large_Courant_incr) then - !call t_startf('fvm:fill_halo_fvm:large_Courant') + if(FVM_TIMERS) call t_startf('fvm:fill_halo_fvm:large_Courant') !if (kmin_jetkmax) then ! call endrun('ERROR: kmax_jet must be .le. kmax passed to run_consistent_se_cslam') !end if @@ -203,8 +204,8 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& kmax_jet_local = min(kmax_jet,kmax) klev = kmax_jet-kmin_jet+1 call fill_halo_fvm(ghostbufQ1,elem,fvm,hybridnew,nets,nete,1,kmin_jet_local,kmax_jet_local,klev,active=ActiveJetThread) - !call t_stopf('fvm:fill_halo_fvm:large_Courant') - !call t_startf('fvm:large_Courant_number_increment') + if(FVM_TIMERS) call t_stopf('fvm:fill_halo_fvm:large_Courant') + if(FVM_TIMERS) call t_startf('fvm:large_Courant_number_increment') if(ActiveJetThread) then do k=kmin_jet_local,kmax_jet_local !1,nlev do ie=nets,nete @@ -212,10 +213,10 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& end do end do endif - !call t_stopf('fvm:large_Courant_number_increment') + if(FVM_TIMERS) call t_stopf('fvm:large_Courant_number_increment') end if - !call t_startf('fvm:end_of_reconstruct_subroutine') + if(FVM_TIMERS) call t_startf('fvm:end_of_reconstruct_subroutine') do k=kmin,kmax ! ! convert to mixing ratio @@ -251,7 +252,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,& elem(ie)%sub_elem_mass_flux(:,:,:,k)=0 end do end do - !call t_stopf('fvm:end_of_reconstruct_subroutine') + if(FVM_TIMERS) call t_stopf('fvm:end_of_reconstruct_subroutine') !$OMP END PARALLEL call omp_set_nested(.false.) end subroutine run_consistent_se_cslam @@ -281,7 +282,7 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt REAL(KIND=r8), dimension(2,8) :: x_start, dgam_vec REAL(KIND=r8) :: gamma_max, displ_first_guess - REAL(KIND=r8) :: flux,flux_tracer(ntrac) + REAL(KIND=r8) :: flux,flux_tracer(ntrac),w REAL(KIND=r8), dimension(num_area) :: dp_area @@ -306,7 +307,6 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt ! ! prepare for air/tracer update ! -! dp = fvm%dp_fvm(1-nhe:nc+nhe,1-nhe:nc+nhe,ilev) dp = fvm%dp_fvm(1-nhc:nc+nhc,1-nhc:nc+nhc,ilev) fvm%dp_fvm(1:nc,1:nc,ilev) = fvm%dp_fvm(1:nc,1:nc,ilev)*fvm%area_sphere do itr=1,ntrac @@ -538,14 +538,14 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt ! ! iterate to get flux area ! - !call t_startf('fvm:swept_area:get_gamma') + if(FVM_TIMERS) call t_startf('fvm:swept_area:get_gamma') do iarea=1,num_area dp_area(iarea) = dp(idx(1,iarea,i,j,iside),idx(2,iarea,i,j,iside)) end do call get_flux_segments_area_iterate(x,x_static,dx_static,dx,x_start,dgam_vec,num_seg,num_seg_static,& num_seg_max,num_area,dp_area,flowcase,gamma,mass_flux_se(i,j,iside),0.0_r8,gamma_max, & gsweights,gspts,ilev) - !call t_stopf('fvm:swept_area:get_gamma') + if(FVM_TIMERS) call t_stopf('fvm:swept_area:get_gamma') ! ! pack segments for high-order weights computation ! @@ -560,10 +560,10 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt ! ! compute higher-order weights ! - !call t_startf('fvm:swept_area:get_high_order_w') + if(FVM_TIMERS) call t_startf('fvm:swept_area:get_high_order_w') call get_high_order_weights_over_areas(x,dx,num_seg,num_seg_max,num_area,weights,ngpc,& gsweights, gspts,irecons_tracer) - !call t_stopf('fvm:swept_area:get_high_order_w') + if(FVM_TIMERS) call t_stopf('fvm:swept_area:get_high_order_w') ! !************************************************** ! @@ -571,16 +571,17 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt ! !************************************************** ! - !call t_startf('fvm:swept_area:remap') + if(FVM_TIMERS) call t_startf('fvm:swept_area:remap') flux=0.0_r8; flux_tracer=0.0_r8 do iarea=1,num_area if (num_seg(iarea)>0) then ii=idx(1,iarea,i,j,iside); jj=idx(2,iarea,i,j,iside) flux=flux+weights(1,iarea)*dp(ii,jj) - do itr=1,ntrac - do iw=1,irecons_tracer_actual - flux_tracer(itr) = flux_tracer(itr)+weights(iw,iarea)*ctracer(iw,ii,jj,itr) - end do + do iw=1,irecons_tracer_actual + w = weights(iw,iarea) + do itr=1,ntrac + flux_tracer(itr) = flux_tracer(itr)+w*ctracer(iw,ii,jj,itr) + end do end do end if end do @@ -614,7 +615,7 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt fvm%dp_fvm(i-1,j,ilev ) = fvm%dp_fvm(i-1,j,ilev )+flux fvm% c(i-1,j,ilev,1:ntrac) = fvm% c(i-1,j,ilev,1:ntrac)+flux_tracer(1:ntrac) end if - !call t_stopf('fvm:swept_area:remap') + if(FVM_TIMERS) call t_stopf('fvm:swept_area:remap') end if end do end do diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 index b4708dfd3b..8b425042be 100644 --- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 +++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 @@ -79,7 +79,7 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,& real (kind=r8), dimension(1-nht:nc+nht,1-nht:nc+nht,3) :: f - integer :: i,j,in,h,itr + integer :: i,j,in,h,itr,k integer, dimension(2,3) :: jx,jy if (present(irecons_actual_in)) then @@ -87,7 +87,6 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,& else irecons_actual = irecons end if - jx(1,1)=jx_min(1); jx(2,1)=jx_max(1)-1 jx(1,2)=jx_min(2); jx(2,2)=jx_max(2)-1 @@ -119,42 +118,12 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,& end do end if if(FVM_TIMERS) call t_stopf('FVM:reconstruction:part#1') - if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#2') - ! - ! fill in non-existent (in physical space) corner values to simplify - ! logic in limiter code (min/max operation) - ! - do itr=1,ntrac_in - if (llimiter(itr)) then - if (cubeboundary>4) then - select case(cubeboundary) - case (nwest) - do h=1,nhe+1 - fcube(0,nc+h ,k_in,itr) = fcube(1-h,nc ,k_in,itr) - fcube(1-h,nc+1,k_in,itr) = fcube(1 ,nc+h,k_in,itr) - end do - case (swest) - do h=1,nhe+1 - fcube(1-h,0,k_in,itr) = fcube(1,1-h,k_in,itr) - fcube(0,1-h,k_in,itr) = fcube(1-h,1,k_in,itr) - end do - case (seast) - do h=1,nhe+1 - fcube(nc+h,0 ,k_in,itr) = fcube(nc,1-h,k_in,itr) - fcube(nc+1,1-h,k_in,itr) = fcube(nc+h,1,k_in,itr) - end do - case (neast) - do h=1,nhe+1 - fcube(nc+h,nc+1,k_in,itr) = fcube(nc,nc+h,k_in,itr) - fcube(nc+1,nc+h,k_in,itr) = fcube(nc+h,nc,k_in,itr) - end do - end select - end if - call slope_limiter(nhe,nc,nhc,fcube(:,:,k_in,itr),jx,jy,irecons,recons(:,:,:,itr),& - spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe),& - recons_metrics,vertex_recons_weights,vtx_cart,irecons_actual) - end if - end do + if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#2') + + call slope_limiter(nhe,nc,nhc,fcube,jx,jy,k_in,nlev_in,ntrac_in,irecons,recons,& + spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe),& + recons_metrics,vertex_recons_weights,vtx_cart,irecons_actual,llimiter,& + cubeboundary) if(FVM_TIMERS) call t_stopf('FVM:reconstruction:part#2') end if if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#3') @@ -185,8 +154,6 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,& end do case(6) do itr=1,ntrac_in -! do j=1-nhe,nc+nhe -! do i=1-nhe,nc+nhe do in=1,3 do j=jy(1,in),jy(2,in) do i=jx(1,in),jx(2,in) @@ -331,31 +298,68 @@ subroutine get_gradients(f,jx,jy,irecons,gradient,rot_matrix,centroid_stretch,nc end subroutine get_gradients - subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,recons_metrics,& - vertex_recons_weights,vtx_cart,irecons_actual) + subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,k,nlev,ntrac,irecons,recons,spherecentroid,recons_metrics,& + vertex_recons_weights,vtx_cart,irecons_actual,llimiter,cubeboundary) implicit none - integer , intent(in) :: irecons,nhe,nc,nhc,irecons_actual - real (kind=r8), dimension(1-nhc:, 1-nhc:) , intent(in) :: fcube - real (kind=r8), dimension(irecons,1-nhe:nc+nhe,1-nhe:nc+nhe) , intent(inout):: recons + integer , intent(in) :: irecons_actual,k,nlev,ntrac + integer , intent(in) :: irecons,nhe,nc,nhc + real (kind=r8), dimension(1-nhc:nc+nhc,1-nhc:nc+nhc,nlev,ntrac) , intent(inout) :: fcube + real (kind=r8), dimension(irecons,1-nhe:nc+nhe,1-nhe:nc+nhe,ntrac), intent(inout):: recons integer, dimension(2,3) , intent(in) :: jx,jy real (kind=r8), dimension(irecons-1,1-nhe:nc+nhe,1-nhe:nc+nhe) , intent(in) :: spherecentroid real (kind=r8), dimension(3,1-nhe:nc+nhe,1-nhe:nc+nhe) , intent(in) :: recons_metrics real (kind=r8), dimension(4,1:irecons-1,1-nhe:nc+nhe,1-nhe:nc+nhe), intent(in) :: vertex_recons_weights real (kind=r8), dimension(4,2,1-nhc:nc+nhc,1-nhc:nc+nhc) , intent(in) :: vtx_cart + logical, dimension(ntrac) , intent(in) :: llimiter + integer , intent(in) :: cubeboundary real (kind=r8):: minval_patch,maxval_patch real (kind=r8):: phi, min_val, max_val,disc + real (kind=r8):: v1,v2,v3,v4,vx1,vx2,vx3,vx4,vy1,vy2,vy3,vy4,r2,r3,r4,r5,r6,scx,scy,dx,dy,ex1,ex2,f0,val + real (kind=r8):: m1,m2,m3 real (kind=r8):: min_phi real (kind=r8):: extrema(2), xminmax(2),yminmax(2),extrema_value(13) real(kind=r8) :: invtmp ! temporary to pre-compute inverses - integer :: itmp1,itmp2,i,j,in,vertex,n + integer :: itmp1,itmp2,i,j,in,vertex,n,itr,h -! real (kind=r8), dimension(-1:5) :: diff_value real (kind=r8), dimension(-1:1) :: minval_array, maxval_array real (kind=r8), parameter :: threshold = 1.0E-40_r8 - character(len=128) :: errormsg + character(len=128) :: errormsg + integer :: im1,jm1,ip1,jp1 + ! + ! fill in non-existent (in physical space) corner values to simplify + ! logic in limiter code (min/max operation) + ! + do itr=1,ntrac + if (.not. llimiter(itr)) cycle + if (cubeboundary>4) then + select case(cubeboundary) + case (nwest) + do h=1,nhe+1 + fcube(0,nc+h ,k,itr) = fcube(1-h,nc ,k,itr) + fcube(1-h,nc+1,k,itr) = fcube(1 ,nc+h,k,itr) + end do + case (swest) + do h=1,nhe+1 + fcube(1-h,0,k,itr) = fcube(1,1-h,k,itr) + fcube(0,1-h,k,itr) = fcube(1-h,1,k,itr) + end do + case (seast) + do h=1,nhe+1 + fcube(nc+h,0 ,k,itr) = fcube(nc,1-h,k,itr) + fcube(nc+1,1-h,k,itr) = fcube(nc+h,1,k,itr) + end do + case (neast) + do h=1,nhe+1 + fcube(nc+h,nc+1,k,itr) = fcube(nc,nc+h,k,itr) + fcube(nc+1,nc+h,k,itr) = fcube(nc+h,nc,k,itr) + end do + end select + end if + end do + select case (irecons_actual) ! ! PLM limiter @@ -364,52 +368,51 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,re do in=1,3 do j=jy(1,in),jy(2,in) do i=jx(1,in),jx(2,in) - !rck combined min/max and unrolled inner loop - !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1)) - !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1)) - !DIR$ SIMD - do itmp2=-1,+1 - itmp1 = j+itmp2 - minval_array(itmp2) = min(fcube(i-1,itmp1),fcube(i,itmp1),fcube(i+1,itmp1)) - maxval_array(itmp2) = max(fcube(i-1,itmp1),fcube(i,itmp1),fcube(i+1,itmp1)) - enddo - minval_patch = min(minval_array(-1),minval_array(0),minval_array(1)) - maxval_patch = max(maxval_array(-1),maxval_array(0),maxval_array(1)) - - min_phi=1.0_r8 - - ! - ! coordinate bounds (could be pre-computed!) - ! - xminmax(1) = min(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j)) - xminmax(2) = max(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j)) - yminmax(1) = min(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j)) - yminmax(2) = max(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j)) - - !rck restructured loop - !DIR$ SIMD - do vertex=1,4 - call recons_val_cart_plm(fcube(i,j), vtx_cart(vertex,1,i,j), vtx_cart(vertex,2,i,j), spherecentroid(:,i,j), & - recons(1:3,i,j), extrema_value(vertex)) + do itr = 1, ntrac + if (.not. llimiter(itr)) cycle + !rck combined min/max and unrolled inner loop + !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1)) + !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1)) + !DIR$ SIMD + do itmp2=-1,+1 + itmp1 = j+itmp2 + minval_array(itmp2) = min(fcube(i-1,itmp1,k,itr),fcube(i,itmp1,k,itr),fcube(i+1,itmp1,k,itr)) + maxval_array(itmp2) = max(fcube(i-1,itmp1,k,itr),fcube(i,itmp1,k,itr),fcube(i+1,itmp1,k,itr)) + enddo + minval_patch = min(minval_array(-1),minval_array(0),minval_array(1)) + maxval_patch = max(maxval_array(-1),maxval_array(0),maxval_array(1)) + min_phi=1.0_r8 + ! + ! coordinate bounds (could be pre-computed!) + ! + xminmax(1) = min(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j)) + xminmax(2) = max(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j)) + yminmax(1) = min(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j)) + yminmax(2) = max(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j)) + !rck restructured loop + !DIR$ SIMD + do vertex=1,4 + call recons_val_cart_plm(fcube(i,j,k,itr), vtx_cart(vertex,1,i,j), vtx_cart(vertex,2,i,j), spherecentroid(:,i,j), & + recons(1:3,i,j,itr), extrema_value(vertex)) + end do + max_val = MAXVAL(extrema_value(1:4)) + min_val = MINVAL(extrema_value(1:4)) + if (max_val>maxval_patch.and.abs(max_val-fcube(i,j,k,itr))>threshold) then + phi = (maxval_patch-fcube(i,j,k,itr))/(max_val-fcube(i,j,k,itr)) + if (phithreshold) then + phi = (minval_patch-fcube(i,j,k,itr))/(min_val-fcube(i,j,k,itr)) + if (phimaxval_patch.and.abs(max_val-fcube(i,j))>threshold) then - phi = (maxval_patch-fcube(i,j))/(max_val-fcube(i,j)) - if (phithreshold) then - phi = (minval_patch-fcube(i,j))/(min_val-fcube(i,j)) - if (phi threshold) then - extrema(1) = recons(6,i,j) * recons(3,i,j) - 2.0_r8 * recons(5,i,j) * recons(2,i,j) - extrema(2) = recons(6,i,j) * recons(2,i,j) - 2.0_r8 * recons(4,i,j) * recons(3,i,j) + vx1 = vtx_cart(1,1,i,j); vy1 = vtx_cart(1,2,i,j) + vx2 = vtx_cart(2,1,i,j); vy2 = vtx_cart(2,2,i,j) + vx3 = vtx_cart(3,1,i,j); vy3 = vtx_cart(3,2,i,j) + vx4 = vtx_cart(4,1,i,j); vy4 = vtx_cart(4,2,i,j) + xminmax(1) = min(vx1,vx2,vx3,vx4) + xminmax(2) = max(vx1,vx2,vx3,vx4) + yminmax(1) = min(vy1,vy2,vy3,vy4) + yminmax(2) = max(vy1,vy2,vy3,vy4) + scx = spherecentroid(1,i,j) + scy = spherecentroid(2,i,j) + m1 = recons_metrics(1,i,j) + m2 = recons_metrics(2,i,j) + m3 = recons_metrics(3,i,j) + im1 = i-1; ip1 = i+1 + jm1 = j-1; jp1 = j+1 + do itr = 1, ntrac + if (.not. llimiter(itr)) cycle + !rck combined min/max and unrolled inner loop + !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1)) + !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1)) + minval_patch = min( min( fcube(im1,jm1,k,itr), fcube(i ,jm1,k,itr), fcube(ip1,jm1,k,itr) ), & + min( fcube(im1,j ,k,itr), fcube(i ,j ,k,itr), fcube(ip1,j ,k,itr) ), & + min( fcube(im1,jp1,k,itr), fcube(i ,jp1,k,itr), fcube(ip1,jp1,k,itr) ) ) + maxval_patch = max( max( fcube(im1,jm1,k,itr), fcube(i ,jm1,k,itr), fcube(ip1,jm1,k,itr) ), & + max( fcube(im1,j ,k,itr), fcube(i ,j ,k,itr), fcube(ip1,j ,k,itr) ), & + max( fcube(im1,jp1,k,itr), fcube(i ,jp1,k,itr), fcube(ip1,jp1,k,itr) ) ) + min_phi=1.0_r8 - disc=1.0_r8/disc - extrema(1) = extrema(1) * disc + spherecentroid(1,i,j) - extrema(2) = extrema(2) * disc + spherecentroid(2,i,j) - if ( (extrema(1) - xminmax(1) > -threshold) .and. & !xmin - (extrema(1) - xminmax(2) < threshold) .and. & !xmax - (extrema(2) - yminmax(1) > -threshold) .and. & !ymin - (extrema(2) - yminmax(2) < threshold)) then !ymax - call recons_val_cart(fcube(i,j), extrema(1), extrema(2), spherecentroid(:,i,j), & - recons_metrics(:,i,j), recons(:,i,j), extrema_value(5)) - endif - endif - ! - ! Check all potential minimizer points along element boundaries - ! - if (abs(recons(6,i,j)) > threshold) then - invtmp = 1.0_r8 / (recons(6,i,j) + spherecentroid(2,i,j)) - do n=1,2 - ! Left edge, intercept with du/dx = 0 - extrema(2) = invtmp * (-recons(2,i,j) - 2.0_r8 * recons(4,i,j) * (xminmax(n) - spherecentroid(1,i,j))) - if ((extrema(2) > yminmax(1)-threshold) .and. (extrema(2) < yminmax(2)+threshold)) then - call recons_val_cart(fcube(i,j), xminmax(n), extrema(2), spherecentroid(:,i,j), & - recons_metrics(:,i,j), recons(:,i,j), extrema_value(5+n)) - endif - enddo - ! Top/bottom edge, intercept with du/dy = 0 - invtmp = 1.0_r8 / recons(6,i,j) + spherecentroid(1,i,j) - do n = 1,2 - extrema(1) = invtmp * (-recons(3,i,j) - 2.0_r8 * recons(5,i,j) * (yminmax(n) - spherecentroid(2,i,j))) - if ((extrema(1) > xminmax(1)-threshold) .and. (extrema(1) < xminmax(2)+threshold)) then - call recons_val_cart(fcube(i,j), extrema(1), yminmax(n),spherecentroid(:,i,j), & - recons_metrics(:,i,j), recons(:,i,j), extrema_value(7+n)) - endif + f0 = fcube(i,j,k,itr) + min_val = f0; max_val = f0!initialize min/max + ! + ! compute min/max value at cell corners + !DIR$ SIMD + do vertex=1,4 + val = f0 + do itmp1=1,irecons-1 + val = val + recons(itmp1+1,i,j,itr)*vertex_recons_weights(vertex,itmp1,i,j) + enddo + min_val = min(min_val,val) + max_val = max(max_val,val) enddo - endif - - ! Top/bottom edge, y=const., du/dx=0 - if (abs(recons(4,i,j)) > threshold) then - invtmp = 1.0_r8 / (2.0_r8 * recons(4,i,j))! + spherecentroid(1,i,j) - do n = 1,2 - extrema(1) = spherecentroid(1,i,j)+& - invtmp * (-recons(2,i,j) - recons(6,i,j) * (yminmax(n) - spherecentroid(2,i,j))) - - if ((extrema(1) > xminmax(1)-threshold) .and. (extrema(1) < xminmax(2)+threshold)) then - call recons_val_cart(fcube(i,j), extrema(1), yminmax(n), spherecentroid(:,i,j),& - recons_metrics(:,i,j),recons(:,i,j), extrema_value(9+n)) - endif - enddo - endif - ! Left/right edge, x=const., du/dy=0 - if (abs(recons(5,i,j)) > threshold) then - invtmp = 1.0_r8 / (2.0_r8 * recons(5,i,j)) - do n = 1,2 - extrema(2) = spherecentroid(2,i,j)+& - invtmp * (-recons(3,i,j) - recons(6,i,j) * (xminmax(n) - spherecentroid(1,i,j))) - - if ((extrema(2)>yminmax(1)-threshold) .and. (extrema(2) < yminmax(2)+threshold)) then - call recons_val_cart(fcube(i,j), xminmax(n), extrema(2), spherecentroid(:,i,j), & - recons_metrics(:,i,j), recons(:,i,j), extrema_value(11+n)) - endif - enddo - endif - !rck - combined min/max calculation and unrolled - ! max_val = MAXVAL(extrema_value) - ! min_val = MINVAL(extrema_value) - max_val = extrema_value(13) - min_val = extrema_value(13) - do itmp1 = 1,12,4 - max_val = max(max_val, extrema_value(itmp1),extrema_value(itmp1+1),extrema_value(itmp1+2),extrema_value(itmp1+3)) - min_val = min(min_val, extrema_value(itmp1),extrema_value(itmp1+1),extrema_value(itmp1+2),extrema_value(itmp1+3)) - enddo - !rck + r2 = recons(2,i,j,itr); r3 = recons(3,i,j,itr) + r4 = recons(4,i,j,itr); r5 = recons(5,i,j,itr); r6 = recons(6,i,j,itr) + ! Check if the quadratic is minimized within the element + ! Extrema in the interior of the element (there might be just one candidate) + ! DO NOT NEED ABS here, if disc<0 we have a saddle point (no maximum or minimum) + disc = 4.0_r8 * r4 *r5 - r6*r6 + if (abs(disc) > threshold) then + ex1 = (r6*r3 - 2.0_r8*r5*r2) + ex2 = (r6*r2 - 2.0_r8*r4*r3) + disc = 1 /disc + ex1 = ex1*disc+scx + ex2 = ex2*disc+scy + if ( ex1 > xminmax(1)-threshold .and. ex1 < xminmax(2)+threshold .and. & + ex2 > yminmax(1)-threshold .and. ex2 < yminmax(2)+threshold ) then + dx = ex1 - scx; dy = ex2 - scy + v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy) + max_val = max(max_val, v1) + min_val = min(min_val, v1) + end if + endif + ! + ! Check all potential minimizer points along element boundaries + ! - if (max_val>maxval_patch.and.abs(max_val-fcube(i,j))>threshold) then - phi = (maxval_patch-fcube(i,j))/(max_val-fcube(i,j)) - if (phithreshold) then - phi = (minval_patch-fcube(i,j))/(min_val-fcube(i,j)) - if (phi threshold) then + invtmp = 1.0_r8 / (2.0_r8 * r4) + do n = 1,2 + ex1 = scx+invtmp * (-r2 - r6 * (yminmax(n) - scy)) + if ((ex1 > xminmax(1)-threshold) .and. (ex1 < xminmax(2)+threshold)) then + dx = ex1 - scx; dy = yminmax(n) - scy + v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy) + max_val = max(max_val, v1) + min_val = min(min_val, v1) + endif + enddo + endif + ! + ! Left/right edge, x=const., du/dy=0 + ! + if (abs(r5) > threshold) then + invtmp = 1.0_r8 / (2.0_r8 * r5) + do n = 1,2 + ex1 = scy+invtmp * (-r3 - r6 * (xminmax(n) - scx)) + if ((ex1 > yminmax(1)-threshold) .and. (ex1 < yminmax(2)+threshold)) then + dx = xminmax(n) - scx; dy = ex1 - scy + v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy) + max_val = max(max_val, v1) + min_val = min(min_val, v1) + endif + enddo + endif + ! + if (max_val>maxval_patch.and.abs(max_val-fcube(i,j,k,itr))>threshold) then + phi = (maxval_patch-fcube(i,j,k,itr))/(max_val-fcube(i,j,k,itr)) + if (phithreshold) then + phi = (minval_patch-fcube(i,j,k,itr))/(min_val-fcube(i,j,k,itr)) + if (phi