From 6c243ff6c2786557d0d7a5f63f513587d8446c24 Mon Sep 17 00:00:00 2001
From: Peter Hjort Lauritzen <pel@ucar.edu>
Date: Mon, 18 Aug 2025 08:06:07 -0600
Subject: [PATCH 1/9] performance improvement from John Dennis (CISL)

---
 .../se/dycore/fvm_reconstruction_mod.F90      | 167 ++++++++++--------
 1 file changed, 89 insertions(+), 78 deletions(-)

diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
index b4708dfd3b..fad8f41cb5 100644
--- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
+++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
@@ -151,7 +151,7 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,&
                 end select
              end if
              call slope_limiter(nhe,nc,nhc,fcube(:,:,k_in,itr),jx,jy,irecons,recons(:,:,:,itr),&
-                  spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe),&
+                  spherecentroid,&
                   recons_metrics,vertex_recons_weights,vtx_cart,irecons_actual)
           end if
        end do
@@ -356,6 +356,7 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,re
     real (kind=r8), dimension(-1:1) :: minval_array, maxval_array
     real (kind=r8), parameter :: threshold = 1.0E-40_r8
     character(len=128)        :: errormsg 
+
     select case (irecons_actual)
        !
        ! PLM limiter
@@ -564,6 +565,7 @@ end subroutine slope_limiter
   !        recons ...  array of reconstructed coefficients                            !
   ! OUTPUT: value ... evaluation at a given point                                     !
   !-----------------------------------------------------------------------------------!
+  !DIR$ ATTRIBUTES FORCEINLINE :: recons_val_cart
   subroutine recons_val_cart(fcube, cartx, carty, centroid, pre_computed_metrics, recons, value)
     implicit none
     real(kind=r8), intent(in) :: fcube
@@ -586,6 +588,7 @@ subroutine recons_val_cart(fcube, cartx, carty, centroid, pre_computed_metrics,
          recons(6) * (pre_computed_metrics(3) + dx*dy)
   END subroutine recons_val_cart
 
+    !DIR$ ATTRIBUTES FORCEINLINE :: recons_val_cart_plm
     subroutine recons_val_cart_plm(fcube, cartx, carty, centroid, recons, value)
     implicit none
     real(kind=r8), intent(in) :: fcube
@@ -640,43 +643,62 @@ subroutine slopelimiter_val(value, cell_value, local_min, local_max, min_phi)
   end subroutine slopelimiter_val
   !END SUBROUTINE SLOPELIMITER_VAL------------------------------------------CE-for FVM!
 
-  function matmul_w(w,f,ns)
+  !DIR$ ATTRIBUTES FORCEINLINE :: dotproduct
+  function dotproduct(w,f,ns)
+    implicit none
+    real (kind=r8)                          :: dotproduct
+    real (kind=r8),dimension(:), intent(in) :: w,f      !dimension(ns)
+    integer,                     intent(in) :: ns
+    integer                                 :: k
+
+    if(ns==3) then
+      dotproduct = DotProduct_3(w,f)
+    else
+      dotproduct = DotProduct_gen(w,f,ns)
+    endif
+
+  end function dotproduct
+
+  !DIR$ ATTRIBUTES FORCEINLINE :: DotProduct_gen
+  function DotProduct_gen(w,f,ns)
     implicit none
-    real (kind=r8)                          :: matmul_w
+    real (kind=r8)                          :: DotProduct_gen
     real (kind=r8),dimension(:), intent(in) :: w,f      !dimension(ns)
     integer,                     intent(in) :: ns
     integer                                 :: k
-    matmul_w = 0.0_r8
+    DotProduct_gen = 0.0_r8
     do k=1,ns
-      matmul_w = matmul_w+w(k)*f(k)
+       DotProduct_gen = DotProduct_gen+w(k)*f(k)
     end do
-  end function matmul_w
+  end function DotProduct_gen
 
   ! special hard-coded version of the function where ns=3
   ! for performance optimization
-!  function matmul_w(w, f)
-!    IMPLICIT NONE
-!    REAL(KIND=r8), dimension(3), intent(in) :: w
-!    REAL(KIND=r8), dimension(3), intent(in) :: f
-!    REAL(KIND=r8) :: matmul_w
-!    matmul_w = w(1)*f(1) + w(2)*f(2) + w(3)*f(3)
-!  end function matmul_w
-
-  subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo_interp_weight,ibase,&
+  !DIR$ ATTRIBUTES FORCEINLINE :: DotProduct_3
+  function DotProduct_3(w, f)
+    IMPLICIT NONE
+    REAL(KIND=r8), dimension(3), intent(in) :: w
+    REAL(KIND=r8), dimension(3), intent(in) :: f
+    REAL(KIND=r8) :: DotProduct_3
+    DotProduct_3 = w(1)*f(1) + w(2)*f(2) + w(3)*f(3)
+  end function DotProduct_3
+
+  subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,hWeight,ibase,&
        fpanel,fotherpanel)
     implicit none
     integer, intent(in) :: cubeboundary,nc,nhr,nht,nh,nhc,ns
     real (kind=r8),   &
          dimension(1-nhc:nc+nhc, 1-nhc:nc+nhc), intent(in)          :: fcube
 
-    real (kind=r8), intent(in) :: halo_interp_weight(1:ns,1-nh:nc+nh,1:nhr,2)
+    real (kind=r8), intent(in) :: hWeight(1:ns,1-nh:nc+nh,1:nhr,2)
     integer              , intent(in) :: ibase(1-nh:nc+nh,1:nhr,2)
 
     real (kind=r8)  , dimension(1-nht:nc+nht, 1-nht:nc+nht ), intent(out)           :: fpanel
     real   (kind=r8), dimension(1-nht:nc+nht,1-nht:nc+nht,2), intent(out), optional :: fotherpanel
 
     integer :: i, halo,ibaseref
-    real (kind=r8), dimension(1:ns,1-nh:nc+nh,1:nhr) :: w
+
+    real (kind=r8), dimension(1-nhc:nc+nhc) :: ftmp
     !
     !  fpanel = 1.0E19 !dbg
     !
@@ -783,12 +805,11 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
       !
       ! fill in values that are on the west panels projection
       !
-      w = halo_interp_weight(:,:,:,1)
       do halo=1,nhr
+        ftmp(:) = fcube(1-halo,:)   ! copy to a temporary
         do i=halo-nh,nc+nh-(halo-1)
           ibaseref=ibase(i,halo,1)
-          !           ibaseref = ibase(i,halo,1)
-          fpanel(1-halo ,i) = matmul_w(w(:,i,halo),fcube(1-halo ,ibaseref:ibaseref+ns-1),ns)
+          fpanel(1-halo ,i) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)
         end do
       end do
 
@@ -799,12 +820,13 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
         fotherpanel (1-nht:0,1-nht:nc+nht,1)=fcube(1-nht:0,1-nht:nc+nht)
         !
         do halo=1,nhr
+          ftmp(:) = fcube(halo,:)   ! copy to a temporary
           do i=halo-nh,nc+nh-(halo-1)
             ibaseref=ibase(i,halo,1)
             !
             ! Exploit symmetry in interpolation weights
             !
-            fotherpanel(halo,i,1)     = matmul_w(w(:,i,halo),fcube(halo   ,ibaseref:ibaseref+ns-1),ns)
+            fotherpanel(halo,i,1)     = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)
           end do
         end do
       end if
@@ -851,21 +873,21 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
       !      -3 |-2 |-1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7
       !
       fpanel      (1-nht:nc     ,1-nht:nc+nht  )=fcube(1-nht:nc     ,1-nht:nc+nht)
-      w = halo_interp_weight(:,:,:,1)
       do halo=1,nhr
+        ftmp(:) = fcube(nc+halo,:)   ! copy to a temporary
         do i=halo-nh,nc+nh-(halo-1)
           ibaseref = ibase(i,halo,1)
-          fpanel      (nc+halo   ,i  ) = matmul_w(w(:,i,halo),fcube(nc  +halo,ibaseref:ibaseref+ns-1),ns)
+          fpanel      (nc+halo   ,i  ) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)
         end do
       end do
 
       if (present(fotherpanel)) then
         fotherpanel (nc+1 :nc+nht ,1-nht:nc+nht,1)=fcube(nc+1 :nc+nht ,1-nht:nc+nht) !
         do halo=1,nhr
+          ftmp(:) = fcube(nc+1-halo,:)   ! copy to a temporary
           do i=halo-nh,nc+nh-(halo-1)
-            !           ibaseref=ibase(i,halo,1 )
             ibaseref = ibase(i,halo,1)
-            fotherpanel (nc+1-halo ,i,1) = matmul_w(w(:,i,halo),fcube(nc+1-halo,ibaseref:ibaseref+ns-1),ns)
+            fotherpanel (nc+1-halo ,i,1) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)
           end do
         end do
       end if
@@ -905,11 +927,10 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
       !
       ! fill in values that are on the same projection as "main" element
       fpanel      (1-nht:nc+nht ,1-nht:nc)=fcube(1-nht:nc+nht ,1-nht:nc)
-      w = halo_interp_weight(:,:,:,1)
       do halo=1,nhr
         do i=halo-nh,nc+nh-(halo-1)
           ibaseref = ibase(i,halo,1)
-          fpanel      (i,nc+halo    ) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+halo  ),ns) !north
+          fpanel      (i,nc+halo    ) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,nc+halo),ns) !north
         end do
       end do
       if (present(fotherpanel)) then
@@ -919,7 +940,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
         do halo=1,nhr
           do i=halo-nh,nc+nh-(halo-1)
             ibaseref = ibase(i,halo,1)
-            fotherpanel (i,nc+1-halo,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns)
+            fotherpanel (i,nc+1-halo,1) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns)
           end do
         end do
       end if
@@ -960,19 +981,19 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
       ! fill in values that are on the same projection as "main" element (marked with "i" in Figure above)
       !
       fpanel      (1-nht:nc+nht,1:nc+nht  )=fcube(1-nht:nc+nht,1:nc+nht)
-      w = halo_interp_weight(:,:,:,1)
       do halo=1,nhr
         do i=halo-nh,nc+nh-(halo-1)
           ibaseref=ibase(i,halo,1)!ibase(i,halo,2)
-          fpanel      (i,1-halo ) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,1-halo),ns)  !south
+          fpanel      (i,1-halo ) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,1-halo),ns)  !south
         end do
       end do
       if (present(fotherpanel)) then
         fotherpanel (1-nht:nc+nht,1-nht:0 ,1)=fcube(1-nht:nc+nht,1-nht:0 )
         do halo=1,nhr
+          ftmp(:) = fcube(:,halo)
           do i=halo-nh,nc+nh-(halo-1)
             ibaseref=ibase(i,halo,1)!ibase(i,halo,2)
-            fotherpanel (i,  halo,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,  halo),ns)
+            fotherpanel (i,  halo,1) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,  halo),ns)
           end do
         end do
       end if
@@ -1018,12 +1039,12 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
       !
       ! fill in west part (marked with "w" on Figure above) and south part (marked with "s")
       !
-      w = halo_interp_weight(:,:,:,1)
       do halo=1,nhr
+        ftmp(:)  = fcube(1-halo,:)   ! copy to a temporary
         do i=max(halo-nh,0),nc+nh-(halo-1)
           ibaseref=ibase(i,halo,1)!ibase(i,halo,1)
-          fpanel(1-halo ,i) = matmul_w(w(:,i,halo),fcube(1-halo ,ibaseref:ibaseref+ns-1),ns) !west
-          fpanel(i,1-halo ) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,1-halo) ,ns)  !south
+          fpanel(1-halo ,i) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)   !west
+          fpanel(i,1-halo ) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,1-halo) ,ns)  !south
         end do
       end do
       !
@@ -1075,20 +1096,18 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
         !
         ! compute interpolated cell average values in "p" cells on Figure on above
         !
-        w = halo_interp_weight(:,:,:,1)
         do halo=1,nhr
           do i=max(halo-nh,0),nc+nh-(halo-1)
             ibaseref=ibase(i,halo,1)
             !
             ! use same weights as interpolation south from main panel (symmetric)
             !
-            fotherpanel(i,halo,1)  = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,halo),ns)
+            fotherpanel(i,halo,1)  = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,halo),ns)
           end do
         end do
         !
         ! compute interpolated cell average values in "w" cells on Figure on above
         !
-        w = halo_interp_weight(:,:,:,2)
         do halo=1,nhr
           do i=nc+halo-nhr,nc+1
             ibaseref=ibase(i,halo,2)-nc
@@ -1108,7 +1127,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
             ! !              |              |
             ! ===============================
             !
-            fotherpanel(1-halo,i-nc,1)  = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,halo),ns)
+            fotherpanel(1-halo,i-nc,1)  = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,halo),ns)
           end do
         end do
         fotherpanel(0,1,1) = 0.25_r8*(fotherpanel(-1,1,1)+fotherpanel(1,1,1)+fotherpanel(0,2,1)+fotherpanel(0,0,1))
@@ -1164,21 +1183,21 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
         !
         ! compute interpolated cell average values in "p" cells on Figure on above
         !
-        w = halo_interp_weight(:,:,:,1) ! symmetry
         do halo=1,nhr
+          ftmp(:) = fcube(halo,:)   ! copy to a temporary
           do i=max(halo-nh,0),nc+nh-(halo-1)
             ibaseref=ibase(i,halo,1)
             !
             ! use same weights as interpolation south from main panel (symmetric)
             !
-            fotherpanel(halo,i,2)  = matmul_w(w(:,i,halo),fcube(halo,ibaseref:ibaseref+ns-1),ns)
+            fotherpanel(halo,i,2)  = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)
           end do
         end do
         !
         ! compute interpolated cell average values in "s" cells on Figure on above
         !
-        w = halo_interp_weight(:,:,:,2)
         do halo=1,nhr
+          ftmp(:) = fcube(halo,:)   ! copy to a temporary
           do i=nc+halo-nhr,nc+1
             ibaseref=ibase(i,halo,2)-nc
             !
@@ -1197,7 +1216,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
             ! !              |              |
             ! ===============================
             !
-            fotherpanel(i-nc,1-halo,2)  = matmul_w(w(:,i,halo),fcube(halo,ibaseref:ibaseref+ns-1),ns)
+            fotherpanel(i-nc,1-halo,2)  = dotproduct(hWeight(:,i,halo,2),ftmp(ibaseref:ibaseref+ns-1),ns)
           end do
         end do
         fotherpanel(1,0,2) = 0.25_r8*(fotherpanel(0,0,2)+fotherpanel(2,0,2)+fotherpanel(1,-1,2)+fotherpanel(1,1,2))
@@ -1235,21 +1254,20 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
       !
       ! east
       !
-      w = halo_interp_weight(:,:,:,1)
       do halo=1,nhr
+        ftmp(:) = fcube(nc+halo,:)   ! copy to a temporary
         do i=max(halo-nh,0),nc+nh-(halo-1)
           ibaseref = ibase(i,halo,1)
-          fpanel(nc+halo,i) = matmul_w(w(:,i,halo),fcube(nc  +halo,ibaseref:ibaseref+ns-1),ns)
+          fpanel(nc+halo,i) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)
         end do
       end do
       !
       ! south
       !
-      w = halo_interp_weight(:,:,:,2)
       do halo=1,nhr
         do i=halo-nh,min(nc+nh-(halo-1),nc+1)
           ibaseref = ibase(i,halo,2)
-          fpanel(i,1-halo ) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,1-halo),ns)  !south
+          fpanel(i,1-halo ) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,1-halo),ns)  !south
         end do
       end do
       fpanel(nc+1,0   )=0.25_r8*(&
@@ -1299,20 +1317,19 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
       if (present(fotherpanel)) then
         fotherpanel(1-nht:nc,1-nht:0,1)  = fcube(1-nht:nc,1-nht:0)
         !
-        w = halo_interp_weight(:,:,:,2)
         !
         ! fill in "n" on Figure above
         !
         do halo=1,nhr
+          ftmp(:) = fcube(:,halo)
           do i=halo-nh,min(nc+nh-(halo-1),nc+1)
             ibaseref = ibase(i,halo,2)
-            fotherpanel (i,halo,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,  halo),ns)
+            fotherpanel (i,halo,1) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,  halo),ns)
           end do
         end do
         !
         ! fill in "e" on Figure above
         !
-        w = halo_interp_weight(:,:,:,1)
         do halo=1,nhr
           do i=0,nht-halo!nc+nh-(halo-1)
             ibaseref = ibase(i,halo,1)
@@ -1322,7 +1339,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
             ! use symmetry for weights (same weights as East from main panel but for south panel
             ! projection the indecies are rotated)
             !
-            fotherpanel (nc+halo ,1-i,1) = matmul_w(w(:,i,halo),fcube(nc+ibaseref:nc+ibaseref+ns-1,halo),ns)
+            fotherpanel (nc+halo ,1-i,1) = dotproduct(hWeight(:,i,halo,1),fcube(nc+ibaseref:nc+ibaseref+ns-1,halo),ns)
           end do
         end do
         fotherpanel(nc+1,1,1) = 0.25_r8*(fotherpanel(nc+2,1,1)+fotherpanel(nc,1,1)&
@@ -1378,18 +1395,18 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
         !
         ! fill in "w" on Figure above
         !
-        w = halo_interp_weight(:,:,:,1)
         do halo=1,nhr
+          ftmp(:) = fcube(nc+1-halo,:)   ! copy to a temporary
           do i=0,nc+nh-(halo-1)
             ibaseref = ibase(i,halo,1)
-            fotherpanel(nc+1-halo,i,2) = matmul_w(w(:,i,halo),fcube(nc+1-halo,ibaseref:ibaseref+ns-1),ns)
+            fotherpanel(nc+1-halo,i,2) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)
           end do
         end do
         !
         ! fill in "s" on Figure above
         !
-        w = halo_interp_weight(:,:,:,2)
         do halo=1,nhr
+          ftmp(:) = fcube(nc+1-halo,:)   ! copy to a temporary
           do i=nc+1-nht+halo,nc+1
             !
             !
@@ -1417,7 +1434,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
             !
             ! fcube index: due to rotation (see Figure above)
             !
-            fotherpanel(nc+(nc+1-i),1-halo,2) = matmul_w(w(:,i,halo),fcube(nc+1-halo,ibaseref:ibaseref+ns-1),ns)
+            fotherpanel(nc+(nc+1-i),1-halo,2) = dotproduct(hWeight(:,i,halo,2),ftmp(ibaseref:ibaseref+ns-1),ns)
           end do
         end do
         fotherpanel(nc,0,2) = 0.25_r8*(fotherpanel(nc+1,0,2)+fotherpanel(nc-1,0,2)&
@@ -1454,21 +1471,20 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
       !
       ! west
       !
-      w = halo_interp_weight(:,:,:,1)
       do halo=1,nhr
+        ftmp(:) = fcube(1-halo,:)   ! copy to a temporary
         do i=halo-nh,min(nc+nh-(halo-1),nc+1)
           ibaseref=ibase(i,halo,1)
-          fpanel(1-halo ,i) = matmul_w(w(:,i,halo),fcube(1-halo ,ibaseref:ibaseref+ns-1),ns)
+          fpanel(1-halo ,i) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)
         end do
       end do
       !
       ! north
       !
-      w = halo_interp_weight(:,:,:,2)
       do halo=1,nhr
         do i=max(halo-nh,0),nc+nh-(halo-1)
            ibaseref = ibase(i,halo,2)
-           fpanel(i,nc+halo) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+halo  ),ns) !north
+           fpanel(i,nc+halo) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,nc+halo  ),ns) !north
          end do
        end do
        fpanel(0   ,nc+1)=0.25_r8*(&
@@ -1509,11 +1525,10 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
          !
          ! (use code from north above)
          !
-         w = halo_interp_weight(:,:,:,2)
          do halo=1,nhr
            do i=max(halo-nh,0),nc+nh-(halo-1)
              ibaseref = ibase(i,halo,2)
-             fotherpanel(i,nc+1-halo,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+1-halo  ),ns)
+             fotherpanel(i,nc+1-halo,1) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,nc+1-halo  ),ns)
            end do
          end do
          !
@@ -1521,11 +1536,10 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
          !
          ! (use code from west above)
          !
-         w = halo_interp_weight(:,:,:,1)
          do halo=1,nhr
            do i=nc+1-nht+halo,nc+1
              ibaseref=ibase(i,halo,1)-nc
-             fotherpanel(1-halo,nc-(i-(nc+1)),1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns)
+             fotherpanel(1-halo,nc-(i-(nc+1)),1) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns)
            end do
          end do
          fotherpanel(0,nc,1)=0.25_r8*(&
@@ -1574,11 +1588,11 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
          !
          ! (use code from west above)
          !
-         w = halo_interp_weight(:,:,:,1)
          do halo=1,nhr
+           ftmp(:) = fcube(halo,:)   ! copy to a temporary
            do i=halo-nh,min(nc+nh-(halo-1),nc+1)
              ibaseref=ibase(i,halo,1)
-             fotherpanel(halo ,i,2) = matmul_w(w(:,i,halo),fcube(halo ,ibaseref:ibaseref+ns-1),ns)
+             fotherpanel(halo ,i,2) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)
            end do
          end do
          !
@@ -1587,11 +1601,11 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
          !
          ! (use code from north above)
          !
-         w = halo_interp_weight(:,:,:,2)
          do halo=1,nhr
+           ftmp(:) = fcube(halo,:)   ! copy to a temporary
            do i=0,nht-halo
              ibaseref = ibase(i,halo,2)+nc
-             fotherpanel(1-i,nc+halo,2) = matmul_w(w(:,i,halo),fcube(halo,ibaseref:ibaseref+ns-1),ns) !north
+             fotherpanel(1-i,nc+halo,2) = dotproduct(hWeight(:,i,halo,2),ftmp(ibaseref:ibaseref+ns-1),ns) !north
            end do
          end do
          fotherpanel(1,nc+1,2)=0.25_r8*(&
@@ -1630,21 +1644,20 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
        !
        ! east
        !
-       w = halo_interp_weight(:,:,:,1)
        do halo=1,nhr
+         ftmp(:) = fcube(nc+halo,:)   ! copy to a temporary
          do i=halo-nh,min(nc+nh-(halo-1),nc+1)
            ibaseref=ibase(i,halo,1 )
-           fpanel(nc+halo,i) = matmul_w(w(:,i,halo),fcube(nc  +halo,ibaseref:ibaseref+ns-1),ns)
+           fpanel(nc+halo,i) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)
          end do
        end do
        !
        ! north
        !
-       !     w = halo_interp_weight(:,:,:,1)
        do halo=1,nhr
          do i=halo-nh,min(nc+nh-(halo-1),nc+1)
            ibaseref=ibase(i,halo,1)
-           fpanel(i,nc+halo) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+halo  ),ns) !north
+           fpanel(i,nc+halo) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,nc+halo  ),ns) !north
          end do
        end do
        fpanel(nc+1,nc+1)=0.25_r8*(&
@@ -1697,17 +1710,15 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
          !
          ! (use north case from above and shift/reverse j-index
          !
-         w = halo_interp_weight(:,:,:,1)
          do halo=1,nhr
            do i=halo-nh,min(nc+nh-(halo-1),nc+1)
              ibaseref=ibase(i,halo,1)
-             fotherpanel (i,nc+1-halo,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns)
+             fotherpanel (i,nc+1-halo,1) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns)
            end do
          end do
          !
          ! fill in "e" on Figure above
          !
-         w = halo_interp_weight(:,:,:,2)
          do halo=1,nhr
            do i=max(halo-nh,0),nht-halo
              ibaseref=ibase(i,halo,2) +nc
@@ -1715,7 +1726,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
              ! fotherpanel uses indexing of main panel's projection
              ! fcube: rotated indexing
              !
-             fotherpanel (nc+halo,nc+i,1) = matmul_w(w(:,i,halo),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns)
+             fotherpanel (nc+halo,nc+i,1) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,nc+1-halo),ns)
            end do
          end do
          fotherpanel(nc+1,nc,1)=0.25_r8*(&
@@ -1774,25 +1785,25 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,halo
          !
          ! (use east case from above and shift/reverse j-index
          !
-         w = halo_interp_weight(:,:,:,1)
          do halo=1,nhr
+           ftmp(:) = fcube(nc+1-halo,:)   ! copy to a temporary
            do i=halo-nh,min(nc+nh-(halo-1),nc+1)
              ibaseref=ibase(i,halo,1 )
-             fotherpanel(nc+1-halo,i,2) = matmul_w(w(:,i,halo),fcube(nc+1-halo,ibaseref:ibaseref+ns-1),ns)
+             fotherpanel(nc+1-halo,i,2) = dotproduct(hWeight(:,i,halo,1),ftmp(ibaseref:ibaseref+ns-1),ns)
            end do
          end do
          !
          ! fill in "n" on Figure above
          !
-         w = halo_interp_weight(:,:,:,2)
          do halo=1,nhr
+           ! ftmp(:) = fcube(nc+1-halo,:)   ! copy to a temporary
            do i=max(halo-nh,0),nht-halo
              ibaseref=ibase(i,halo,2) +nc
              !
              ! fotherpanel uses indexing of main panel's projection
              ! fcube: rotated indexing
              !
-             fotherpanel (nc+i,nc+halo,2) = matmul_w(w(:,i,halo),fcube(nc+1-halo,ibaseref:ibaseref+ns-1),ns)
+             fotherpanel (nc+i,nc+halo,2) = dotproduct(hWeight(:,i,halo,2),ftmp(ibaseref:ibaseref+ns-1),ns)
            end do
          end do
          fotherpanel(nc,nc+1,2)=0.25_r8*(&

From 948926f154988833eaac9ada947e494dd36604b5 Mon Sep 17 00:00:00 2001
From: Peter Hjort Lauritzen <pel@ucar.edu>
Date: Mon, 18 Aug 2025 08:14:41 -0600
Subject: [PATCH 2/9] update ChangeLog

---
 doc/ChangeLog | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/doc/ChangeLog b/doc/ChangeLog
index 2381ecdeb7..9a6a924dd1 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,39 @@
+===============================================================
+
+Tag name: cam6_
+Originator(s): johnmauff, pel
+Date: April 28, 2025
+One-line Summary: 
+Github PR URL: https://github.com/ESCOMP/CAM/pull/
+
+Purpose of changes (include the issue number and title text for each relevant GitHub issue):
+   Excessive data movement in extend_panel_interpolate (CSLAM): https://github.com/ESCOMP/CAM/issues/1316
+
+   The subroutine extend_panel_interpolate is written such that the compiler will generate more data movement than is necessary.
+   This excessive data movement intensifies a computational load imbalance in the CSLAM advection. While it is impossible to eliminate
+   the load imbalance that is caused by the special treatment of panels at the corners of the cubed sphere, it is possible to reduce
+   the cost of this subroutine by changing the way that the subroutine is written.
+
+Describe any changes made to build system: N/A
+
+Describe any changes made to the namelist: N/A
+
+List any changes to the defaults for the boundary datasets: N/A
+
+Describe any substantial timing or memory changes: N/A
+
+Code reviewed by:
+
+List all files eliminated: N/A
+
+List all files added and what they do: N/A
+
+List all existing files that have been modified, and describe the changes:
+
+....
+
+Summarize any changes to answers:
+	No answer changes, all b4b
 
 ===============================================================
 

From e6d5b831d8b43847375c33355f2b518558d3b8c6 Mon Sep 17 00:00:00 2001
From: Peter Hjort Lauritzen <pel@ucar.edu>
Date: Wed, 20 Aug 2025 03:35:39 -0600
Subject: [PATCH 3/9] remove unused variables

---
 doc/ChangeLog                                     | 2 +-
 src/dynamics/se/dycore/fvm_reconstruction_mod.F90 | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/doc/ChangeLog b/doc/ChangeLog
index 9a6a924dd1..59086e3916 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -3,7 +3,7 @@
 Tag name: cam6_
 Originator(s): johnmauff, pel
 Date: April 28, 2025
-One-line Summary: 
+One-line Summary:
 Github PR URL: https://github.com/ESCOMP/CAM/pull/
 
 Purpose of changes (include the issue number and title text for each relevant GitHub issue):
diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
index fad8f41cb5..13e1aa0cf2 100644
--- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
+++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
@@ -990,7 +990,6 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,hWei
       if (present(fotherpanel)) then
         fotherpanel (1-nht:nc+nht,1-nht:0 ,1)=fcube(1-nht:nc+nht,1-nht:0 )
         do halo=1,nhr
-          ftmp(:) = fcube(:,halo)
           do i=halo-nh,nc+nh-(halo-1)
             ibaseref=ibase(i,halo,1)!ibase(i,halo,2)
             fotherpanel (i,  halo,1) = dotproduct(hWeight(:,i,halo,1),fcube(ibaseref:ibaseref+ns-1,  halo),ns)
@@ -1321,7 +1320,6 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,hWei
         ! fill in "n" on Figure above
         !
         do halo=1,nhr
-          ftmp(:) = fcube(:,halo)
           do i=halo-nh,min(nc+nh-(halo-1),nc+1)
             ibaseref = ibase(i,halo,2)
             fotherpanel (i,halo,1) = dotproduct(hWeight(:,i,halo,2),fcube(ibaseref:ibaseref+ns-1,  halo),ns)

From 9e7393bd10c3dc9aa5fdac5a1f8281dc978160af Mon Sep 17 00:00:00 2001
From: Peter Hjort Lauritzen <pel@ucar.edu>
Date: Thu, 21 Aug 2025 07:24:18 -0600
Subject: [PATCH 4/9] fix bug

---
 src/dynamics/se/dycore/fvm_reconstruction_mod.F90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
index 13e1aa0cf2..51a432fba6 100644
--- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
+++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
@@ -1794,7 +1794,7 @@ subroutine extend_panel_interpolate(nc,nhc,nhr,nht,ns,nh,fcube,cubeboundary,hWei
          ! fill in "n" on Figure above
          !
          do halo=1,nhr
-           ! ftmp(:) = fcube(nc+1-halo,:)   ! copy to a temporary
+           ftmp(:) = fcube(nc+1-halo,:)   ! copy to a temporary
            do i=max(halo-nh,0),nht-halo
              ibaseref=ibase(i,halo,2) +nc
              !

From 8c08ece782f6c22b209aeea4440b77c9c270330f Mon Sep 17 00:00:00 2001
From: Peter Hjort Lauritzen <pel@ucar.edu>
Date: Thu, 23 Oct 2025 04:03:45 -0600
Subject: [PATCH 5/9] better vectorization for large NTRAC, ifdef for timers

---
 .../se/dycore/fvm_consistent_se_cslam.F90     | 57 ++++++++++---------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90 b/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90
index d3becc8446..3e56498b38 100644
--- a/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90
+++ b/src/dynamics/se/dycore/fvm_consistent_se_cslam.F90
@@ -1,3 +1,4 @@
+#define FVM_TIMERS .FALSE.
 module fvm_consistent_se_cslam
   use shr_kind_mod,           only: r8=>shr_kind_r8
   use dimensions_mod,         only: nc, nhe, nlev, ntrac, np, nhr, nhc, ngpc, ns, nht
@@ -107,7 +108,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
     endif
 
     kblk = kmax-kmin+1
-    !call t_startf('fvm:before_Qnhc')
+    if(FVM_TIMERS) call t_startf('fvm:before_Qnhc')
     do ie=nets,nete
        do k=kmin,kmax
           elem(ie)%sub_elem_mass_flux(:,:,:,k) = dt_fvm*elem(ie)%sub_elem_mass_flux(:,:,:,k)*fvm(ie)%dp_ref_inverse(k)
@@ -120,11 +121,11 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
           call ghostpack(ghostbufQnhc,fvm(ie)%c(1-nhc:nc+nhc,1-nhc:nc+nhc,kmin:kmax,q),kblk,kptr,ie)
        enddo
     end do
-    !call t_stopf('fvm:before_Qnhc')
-    !call t_startf('fvm:ghost_exchange:Qnhc')
+    if(FVM_TIMERS) call t_stopf('fvm:before_Qnhc')
+    if(FVM_TIMERS) call t_startf('fvm:ghost_exchange:Qnhc')
     call ghost_exchange(hybridnew,ghostbufQnhc,location='ghostbufQnhc')
-    !call t_stopf('fvm:ghost_exchange:Qnhc')
-    !call t_startf('fvm:orthogonal_swept_areas')
+    if(FVM_TIMERS) call t_stopf('fvm:ghost_exchange:Qnhc')
+    if(FVM_TIMERS) call t_startf('fvm:orthogonal_swept_areas')
     do ie=nets,nete
       do k=kmin,kmax
         fvm(ie)%se_flux    (1:nc,1:nc,:,k) = elem(ie)%sub_elem_mass_flux(:,:,:,k)
@@ -152,14 +153,14 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
       end do
     enddo
 
-    !call t_stopf('fvm:orthogonal_swept_areas')
+    if(FVM_TIMERS) call t_stopf('fvm:orthogonal_swept_areas')
     do ie=nets,nete
        ! Intel compiler version 2023.0.0 on derecho had significant slowdown on subroutine interface without
        ! these pointers.
       fcube => fvm(ie)%c(:,:,:,:)
       spherecentroid => fvm(ie)%spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe)
       do k=kmin,kmax
-         !call t_startf('FVM:tracers_reconstruct')
+         if(FVM_TIMERS) call t_startf('FVM:tracers_reconstruct')
          call reconstruction(fcube,nlev,k,&
              ctracer(:,:,:,:),irecons_tracer,llimiter,ntrac,&
              nc,nhe,nhr,nhc,nht,ns,nhr+(nhe-1),&
@@ -170,10 +171,10 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
              fvm(ie)%rot_matrix,fvm(ie)%centroid_stretch,&
              fvm(ie)%vertex_recons_weights,fvm(ie)%vtx_cart,&
              irecons_tracer_lev(k))
-         !call t_stopf('FVM:tracers_reconstruct')
-         !call t_startf('fvm:swept_flux')
+         if(FVM_TIMERS) call t_stopf('FVM:tracers_reconstruct')
+         if(FVM_TIMERS) call t_startf('fvm:swept_flux')
          call swept_flux(elem(ie),fvm(ie),k,ctracer,irecons_tracer_lev(k),gsweights,gspts)
-         !call t_stopf('fvm:swept_flux')
+         if(FVM_TIMERS) call t_stopf('fvm:swept_flux')
       end do
     end do
     !
@@ -193,7 +194,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
     !
     !
     if (large_Courant_incr) then
-      !call t_startf('fvm:fill_halo_fvm:large_Courant')
+      if(FVM_TIMERS) call t_startf('fvm:fill_halo_fvm:large_Courant')
       !if (kmin_jet<kmin.or.kmax_jet>kmax) then
       !  call endrun('ERROR: kmax_jet must be .le. kmax passed to run_consistent_se_cslam')
       !end if      
@@ -203,8 +204,8 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
       kmax_jet_local = min(kmax_jet,kmax)
       klev = kmax_jet-kmin_jet+1
       call fill_halo_fvm(ghostbufQ1,elem,fvm,hybridnew,nets,nete,1,kmin_jet_local,kmax_jet_local,klev,active=ActiveJetThread)
-      !call t_stopf('fvm:fill_halo_fvm:large_Courant')
-      !call t_startf('fvm:large_Courant_number_increment')
+      if(FVM_TIMERS) call t_stopf('fvm:fill_halo_fvm:large_Courant')
+      if(FVM_TIMERS) call t_startf('fvm:large_Courant_number_increment')
       if(ActiveJetThread) then 
         do k=kmin_jet_local,kmax_jet_local !1,nlev
           do ie=nets,nete
@@ -212,10 +213,10 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
           end do
         end do
       endif
-      !call t_stopf('fvm:large_Courant_number_increment')
+      if(FVM_TIMERS) call t_stopf('fvm:large_Courant_number_increment')
     end if
 
-    !call t_startf('fvm:end_of_reconstruct_subroutine')
+    if(FVM_TIMERS) call t_startf('fvm:end_of_reconstruct_subroutine')
     do k=kmin,kmax
       !
       ! convert to mixing ratio
@@ -251,7 +252,7 @@ subroutine run_consistent_se_cslam(elem,fvm,hybrid,dt_fvm,tl,nets,nete,hvcoord,&
         elem(ie)%sub_elem_mass_flux(:,:,:,k)=0
       end do
     end do
-    !call t_stopf('fvm:end_of_reconstruct_subroutine')
+    if(FVM_TIMERS) call t_stopf('fvm:end_of_reconstruct_subroutine')
     !$OMP END PARALLEL 
     call omp_set_nested(.false.)
   end subroutine run_consistent_se_cslam
@@ -281,7 +282,7 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
     REAL(KIND=r8), dimension(2,8) :: x_start, dgam_vec
     REAL(KIND=r8) :: gamma_max, displ_first_guess
 
-    REAL(KIND=r8) :: flux,flux_tracer(ntrac)
+    REAL(KIND=r8) :: flux,flux_tracer(ntrac),w
 
     REAL(KIND=r8), dimension(num_area) :: dp_area
 
@@ -306,7 +307,6 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
     !
     ! prepare for air/tracer update
     !
-!    dp = fvm%dp_fvm(1-nhe:nc+nhe,1-nhe:nc+nhe,ilev)
     dp = fvm%dp_fvm(1-nhc:nc+nhc,1-nhc:nc+nhc,ilev)
     fvm%dp_fvm(1:nc,1:nc,ilev) = fvm%dp_fvm(1:nc,1:nc,ilev)*fvm%area_sphere
     do itr=1,ntrac
@@ -538,14 +538,14 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
             !
             ! iterate to get flux area
             !
-            !call t_startf('fvm:swept_area:get_gamma')
+            if(FVM_TIMERS) call t_startf('fvm:swept_area:get_gamma')
             do iarea=1,num_area
               dp_area(iarea) = dp(idx(1,iarea,i,j,iside),idx(2,iarea,i,j,iside))
             end do
             call get_flux_segments_area_iterate(x,x_static,dx_static,dx,x_start,dgam_vec,num_seg,num_seg_static,&
                  num_seg_max,num_area,dp_area,flowcase,gamma,mass_flux_se(i,j,iside),0.0_r8,gamma_max,          &
                  gsweights,gspts,ilev)
-            !call t_stopf('fvm:swept_area:get_gamma')
+            if(FVM_TIMERS) call t_stopf('fvm:swept_area:get_gamma')
             !
             ! pack segments for high-order weights computation
             !
@@ -560,10 +560,10 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
             !
             ! compute higher-order weights
             !
-            !call t_startf('fvm:swept_area:get_high_order_w')
+            if(FVM_TIMERS) call t_startf('fvm:swept_area:get_high_order_w')
             call get_high_order_weights_over_areas(x,dx,num_seg,num_seg_max,num_area,weights,ngpc,&
                  gsweights, gspts,irecons_tracer)
-            !call t_stopf('fvm:swept_area:get_high_order_w')
+            if(FVM_TIMERS) call t_stopf('fvm:swept_area:get_high_order_w')
             !
             !**************************************************
             !
@@ -571,16 +571,17 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
             !
             !**************************************************
             !
-            !call t_startf('fvm:swept_area:remap')
+            if(FVM_TIMERS) call t_startf('fvm:swept_area:remap')
             flux=0.0_r8; flux_tracer=0.0_r8
             do iarea=1,num_area
               if (num_seg(iarea)>0) then
                 ii=idx(1,iarea,i,j,iside); jj=idx(2,iarea,i,j,iside)
                 flux=flux+weights(1,iarea)*dp(ii,jj)
-                do itr=1,ntrac
-                  do iw=1,irecons_tracer_actual
-                    flux_tracer(itr) = flux_tracer(itr)+weights(iw,iarea)*ctracer(iw,ii,jj,itr)
-                  end do
+                do iw=1,irecons_tracer_actual
+                   w = weights(iw,iarea)
+                   do itr=1,ntrac
+                      flux_tracer(itr) = flux_tracer(itr)+w*ctracer(iw,ii,jj,itr)
+                   end do
                 end do
               end if
             end do
@@ -614,7 +615,7 @@ subroutine swept_flux(elem,fvm,ilev,ctracer,irecons_tracer_actual,gsweights,gspt
               fvm%dp_fvm(i-1,j,ilev        ) = fvm%dp_fvm(i-1,j,ilev        )+flux
               fvm%     c(i-1,j,ilev,1:ntrac) = fvm%     c(i-1,j,ilev,1:ntrac)+flux_tracer(1:ntrac)
             end if
-            !call t_stopf('fvm:swept_area:remap')
+            if(FVM_TIMERS) call t_stopf('fvm:swept_area:remap')
           end if
         end do
       end do

From 6aefddb7c615a4538d25653223862268e8e28b3c Mon Sep 17 00:00:00 2001
From: Peter Hjort Lauritzen <pel@ucar.edu>
Date: Thu, 23 Oct 2025 04:14:05 -0600
Subject: [PATCH 6/9] better performance for large tracer counts

---
 .../se/dycore/fvm_reconstruction_mod.F90      | 435 +++++++++---------
 1 file changed, 224 insertions(+), 211 deletions(-)

diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
index 51a432fba6..fd3dbe4716 100644
--- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
+++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
@@ -79,7 +79,7 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,&
 
     real (kind=r8), dimension(1-nht:nc+nht,1-nht:nc+nht,3) :: f
 
-    integer :: i,j,in,h,itr
+    integer :: i,j,in,h,itr,k
     integer,               dimension(2,3)                              :: jx,jy
 
     if (present(irecons_actual_in)) then 
@@ -87,7 +87,6 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,&
     else
        irecons_actual = irecons
     end if
-    
 
     jx(1,1)=jx_min(1); jx(2,1)=jx_max(1)-1
     jx(1,2)=jx_min(2); jx(2,2)=jx_max(2)-1
@@ -119,42 +118,12 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,&
           end do
        end if
        if(FVM_TIMERS) call t_stopf('FVM:reconstruction:part#1')
-       if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#2')       
-       !
-       ! fill in non-existent (in physical space) corner values to simplify
-       ! logic in limiter code (min/max operation)
-       !
-       do itr=1,ntrac_in
-          if (llimiter(itr)) then
-             if (cubeboundary>4) then
-                select case(cubeboundary)
-                case (nwest)
-                   do h=1,nhe+1
-                      fcube(0,nc+h  ,k_in,itr) = fcube(1-h,nc  ,k_in,itr)
-                      fcube(1-h,nc+1,k_in,itr) = fcube(1  ,nc+h,k_in,itr)
-                   end do
-                case (swest)
-                   do h=1,nhe+1
-                      fcube(1-h,0,k_in,itr) = fcube(1,1-h,k_in,itr)
-                      fcube(0,1-h,k_in,itr) = fcube(1-h,1,k_in,itr)
-                   end do
-                case (seast)
-                   do h=1,nhe+1
-                      fcube(nc+h,0  ,k_in,itr) = fcube(nc,1-h,k_in,itr)
-                      fcube(nc+1,1-h,k_in,itr) = fcube(nc+h,1,k_in,itr)
-                   end do
-                case (neast)
-                   do h=1,nhe+1
-                      fcube(nc+h,nc+1,k_in,itr) = fcube(nc,nc+h,k_in,itr)
-                      fcube(nc+1,nc+h,k_in,itr) = fcube(nc+h,nc,k_in,itr)
-                   end do
-                end select
-             end if
-             call slope_limiter(nhe,nc,nhc,fcube(:,:,k_in,itr),jx,jy,irecons,recons(:,:,:,itr),&
-                  spherecentroid,&
-                  recons_metrics,vertex_recons_weights,vtx_cart,irecons_actual)
-          end if
-       end do
+       if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#2')
+
+       call slope_limiter(nhe,nc,nhc,fcube,jx,jy,k_in,nlev_in,ntrac_in,irecons,recons,&
+            spherecentroid(:,1-nhe:nc+nhe,1-nhe:nc+nhe),&
+            recons_metrics,vertex_recons_weights,vtx_cart,irecons_actual,llimiter,&
+            cubeboundary)
        if(FVM_TIMERS) call t_stopf('FVM:reconstruction:part#2')
     end if
     if(FVM_TIMERS) call t_startf('FVM:reconstruction:part#3')
@@ -185,8 +154,6 @@ subroutine reconstruction(fcube,nlev_in,k_in,recons,irecons,llimiter,ntrac_in,&
       end do
     case(6)
       do itr=1,ntrac_in
-!        do j=1-nhe,nc+nhe
-!          do i=1-nhe,nc+nhe
         do in=1,3
           do j=jy(1,in),jy(2,in)
             do i=jx(1,in),jx(2,in)
@@ -331,31 +298,67 @@ subroutine get_gradients(f,jx,jy,irecons,gradient,rot_matrix,centroid_stretch,nc
   end subroutine get_gradients
 
 
-  subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,recons_metrics,&
-       vertex_recons_weights,vtx_cart,irecons_actual)
+  subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,k,nlev,ntrac,irecons,recons,spherecentroid,recons_metrics,&
+       vertex_recons_weights,vtx_cart,irecons_actual,llimiter,cubeboundary)
     implicit none
-    integer                                                           , intent(in) :: irecons,nhe,nc,nhc,irecons_actual
-    real (kind=r8), dimension(1-nhc:, 1-nhc:)                         , intent(in) :: fcube
-    real (kind=r8), dimension(irecons,1-nhe:nc+nhe,1-nhe:nc+nhe)      , intent(inout):: recons
+    integer                                                           , intent(in) :: irecons_actual,k,nlev,ntrac
+    integer                                                           , intent(in) :: irecons,nhe,nc,nhc
+    real (kind=r8), dimension(1-nhc:nc+nhc,1-nhc:nc+nhc,nlev,ntrac)   , intent(inout) :: fcube
+    real (kind=r8), dimension(irecons,1-nhe:nc+nhe,1-nhe:nc+nhe,ntrac), intent(inout):: recons
     integer,               dimension(2,3)                             , intent(in) :: jx,jy
     real (kind=r8), dimension(irecons-1,1-nhe:nc+nhe,1-nhe:nc+nhe)    , intent(in) :: spherecentroid
     real (kind=r8), dimension(3,1-nhe:nc+nhe,1-nhe:nc+nhe)            , intent(in) :: recons_metrics
     real (kind=r8), dimension(4,1:irecons-1,1-nhe:nc+nhe,1-nhe:nc+nhe), intent(in) :: vertex_recons_weights
     real (kind=r8), dimension(4,2,1-nhc:nc+nhc,1-nhc:nc+nhc)          , intent(in) :: vtx_cart
+    logical,        dimension(ntrac)                                  , intent(in) :: llimiter
+    integer                                                           , intent(in) :: cubeboundary
 
     real (kind=r8):: minval_patch,maxval_patch
     real (kind=r8):: phi, min_val, max_val,disc
+    real (kind=r8):: v1,v2,v3,v4,vx1,vx2,vx3,vx4,vy1,vy2,vy3,vy4,r2,r3,r4,r5,r6,scx,scy,dx,dy,ex1,ex2,f0,val
+    real (kind=r8):: m1,m2,m3
 
     real (kind=r8):: min_phi
     real (kind=r8):: extrema(2), xminmax(2),yminmax(2),extrema_value(13)
 
     real(kind=r8) :: invtmp  ! temporary to pre-compute inverses
-    integer       :: itmp1,itmp2,i,j,in,vertex,n
+    integer       :: itmp1,itmp2,i,j,in,vertex,n,itr,h
 
-!    real (kind=r8), dimension(-1:5) :: diff_value
     real (kind=r8), dimension(-1:1) :: minval_array, maxval_array
     real (kind=r8), parameter :: threshold = 1.0E-40_r8
-    character(len=128)        :: errormsg 
+    character(len=128)        :: errormsg
+    integer :: im1,jm1,ip1,jp1
+           !
+       ! fill in non-existent (in physical space) corner values to simplify
+       ! logic in limiter code (min/max operation)
+       !
+    do itr=1,ntrac
+       if (.not. llimiter(itr)) cycle
+       if (cubeboundary>4) then
+          select case(cubeboundary)
+          case (nwest)
+             do h=1,nhe+1
+                fcube(0,nc+h  ,k,itr) = fcube(1-h,nc  ,k,itr)
+                fcube(1-h,nc+1,k,itr) = fcube(1  ,nc+h,k,itr)
+             end do
+          case (swest)
+             do h=1,nhe+1
+                fcube(1-h,0,k,itr) = fcube(1,1-h,k,itr)
+                fcube(0,1-h,k,itr) = fcube(1-h,1,k,itr)
+             end do
+          case (seast)
+             do h=1,nhe+1
+                fcube(nc+h,0  ,k,itr) = fcube(nc,1-h,k,itr)
+                fcube(nc+1,1-h,k,itr) = fcube(nc+h,1,k,itr)
+             end do
+          case (neast)
+             do h=1,nhe+1
+                fcube(nc+h,nc+1,k,itr) = fcube(nc,nc+h,k,itr)
+                fcube(nc+1,nc+h,k,itr) = fcube(nc+h,nc,k,itr)
+             end do
+          end select
+       end if
+    end do
 
     select case (irecons_actual)
        !
@@ -365,52 +368,51 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,re
        do in=1,3
           do j=jy(1,in),jy(2,in)
              do i=jx(1,in),jx(2,in)
-                !rck combined min/max and unrolled inner loop
-                !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1))
-                !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1))
-                !DIR$ SIMD
-                do itmp2=-1,+1
-                   itmp1 = j+itmp2
-                   minval_array(itmp2) = min(fcube(i-1,itmp1),fcube(i,itmp1),fcube(i+1,itmp1))
-                   maxval_array(itmp2) = max(fcube(i-1,itmp1),fcube(i,itmp1),fcube(i+1,itmp1))
-                enddo
-                minval_patch = min(minval_array(-1),minval_array(0),minval_array(1))
-                maxval_patch = max(maxval_array(-1),maxval_array(0),maxval_array(1))
-                
-                min_phi=1.0_r8
-
-                !
-                ! coordinate bounds (could be pre-computed!)
-                !
-                xminmax(1) = min(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j))
-                xminmax(2) = max(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j))
-                yminmax(1) = min(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j))
-                yminmax(2) = max(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j))
-                
-                !rck restructured loop
-                !DIR$ SIMD
-                do vertex=1,4
-                  call recons_val_cart_plm(fcube(i,j), vtx_cart(vertex,1,i,j), vtx_cart(vertex,2,i,j), spherecentroid(:,i,j), &
-                       recons(1:3,i,j), extrema_value(vertex))                   
+                do itr = 1, ntrac
+                   if (.not. llimiter(itr)) cycle
+                   !rck combined min/max and unrolled inner loop
+                   !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1))
+                   !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1))
+                   !DIR$ SIMD
+                   do itmp2=-1,+1
+                      itmp1 = j+itmp2
+                      minval_array(itmp2) = min(fcube(i-1,itmp1,k,itr),fcube(i,itmp1,k,itr),fcube(i+1,itmp1,k,itr))
+                      maxval_array(itmp2) = max(fcube(i-1,itmp1,k,itr),fcube(i,itmp1,k,itr),fcube(i+1,itmp1,k,itr))
+                   enddo
+                   minval_patch = min(minval_array(-1),minval_array(0),minval_array(1))
+                   maxval_patch = max(maxval_array(-1),maxval_array(0),maxval_array(1))
+                   min_phi=1.0_r8
+                   !
+                   ! coordinate bounds (could be pre-computed!)
+                   !
+                   xminmax(1) = min(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j))
+                   xminmax(2) = max(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j))
+                   yminmax(1) = min(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j))
+                   yminmax(2) = max(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j))
+                   !rck restructured loop
+                   !DIR$ SIMD
+                   do vertex=1,4
+                      call recons_val_cart_plm(fcube(i,j,k,itr), vtx_cart(vertex,1,i,j), vtx_cart(vertex,2,i,j), spherecentroid(:,i,j), &
+                           recons(1:3,i,j,itr), extrema_value(vertex))
+                   end do
+                   max_val = MAXVAL(extrema_value(1:4))
+                   min_val = MINVAL(extrema_value(1:4))
+                   if (max_val>maxval_patch.and.abs(max_val-fcube(i,j,k,itr))>threshold) then
+                      phi = (maxval_patch-fcube(i,j,k,itr))/(max_val-fcube(i,j,k,itr))
+                      if (phi<min_phi) min_phi=phi
+                   end if
+                   if (min_val<minval_patch.and.abs(min_val-fcube(i,j,k,itr))>threshold) then
+                      phi = (minval_patch-fcube(i,j,k,itr))/(min_val-fcube(i,j,k,itr))
+                      if (phi<min_phi) min_phi=phi
+                   end if
+                   ! Apply monotone limiter to all reconstruction coefficients
+                   recons(2:3,i,j,itr)=min_phi*recons(2:3,i,j,itr)
                 end do
-                max_val = MAXVAL(extrema_value(1:4))
-                min_val = MINVAL(extrema_value(1:4))
-
-                if (max_val>maxval_patch.and.abs(max_val-fcube(i,j))>threshold) then
-                  phi = (maxval_patch-fcube(i,j))/(max_val-fcube(i,j))
-                  if (phi<min_phi) min_phi=phi
-                end if
-                if (min_val<minval_patch.and.abs(min_val-fcube(i,j))>threshold) then
-                  phi = (minval_patch-fcube(i,j))/(min_val-fcube(i,j))
-                  if (phi<min_phi) min_phi=phi
-                end if
-                ! Apply monotone limiter to all reconstruction coefficients
-                recons(2:3,i,j)=min_phi*recons(2:3,i,j)                
-              end do
+             end do
           end do
        end do
        !
-       ! PPM limiter
+       ! PPM limiter (optimized)
        !
     case(6)
        !
@@ -419,127 +421,142 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,re
        do in=1,3
           do j=jy(1,in),jy(2,in)
              do i=jx(1,in),jx(2,in)
-                !rck combined min/max and unrolled inner loop
-                !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1))
-                !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1))
-                !DIR$ SIMD
-                do itmp2=-1,+1
-                   itmp1 = j+itmp2
-                   minval_array(itmp2) = min(fcube(i-1,itmp1),fcube(i,itmp1),fcube(i+1,itmp1))
-                   maxval_array(itmp2) = max(fcube(i-1,itmp1),fcube(i,itmp1),fcube(i+1,itmp1))
-                enddo
-                minval_patch = min(minval_array(-1),minval_array(0),minval_array(1))
-                maxval_patch = max(maxval_array(-1),maxval_array(0),maxval_array(1))
-
-                min_phi=1.0_r8
-                !rck restructured loop
-
-                extrema_value(1:4) = fcube(i,j)
-!DIR$ SIMD
-                do vertex=1,4
-                  do itmp1=1,irecons-1
-                    extrema_value(vertex) = extrema_value(vertex) + &
-                         recons(itmp1+1,i,j)*vertex_recons_weights(vertex,itmp1,i,j)
-                  enddo
-                enddo
-                extrema_value(5:13) = extrema_value(1)
                 !
                 ! coordinate bounds (could be pre-computed!)
                 !
-                xminmax(1) = min(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j))
-                xminmax(2) = max(vtx_cart(1,1,i,j),vtx_cart(2,1,i,j),vtx_cart(3,1,i,j),vtx_cart(4,1,i,j))
-                yminmax(1) = min(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j))
-                yminmax(2) = max(vtx_cart(1,2,i,j),vtx_cart(2,2,i,j),vtx_cart(3,2,i,j),vtx_cart(4,2,i,j))
-
-                ! Check if the quadratic is minimized within the element
-                ! Extrema in the interior of the element (there might be just one candidate)
-                ! DO NOT NEED ABS here, if disc<0 we have a saddle point (no maximum or minimum)
-                disc =  4.0_r8 * recons(4,i,j) * recons(5,i,j) - recons(6,i,j)**2
-                if (abs(disc) > threshold) then
-                   extrema(1) = recons(6,i,j) * recons(3,i,j) - 2.0_r8 * recons(5,i,j) * recons(2,i,j)
-                   extrema(2) = recons(6,i,j) * recons(2,i,j) - 2.0_r8 * recons(4,i,j) * recons(3,i,j)
-
-                   disc=1.0_r8/disc
-                   extrema(1) = extrema(1) * disc + spherecentroid(1,i,j)
-                   extrema(2) = extrema(2) * disc + spherecentroid(2,i,j)
-                   if ( (extrema(1) - xminmax(1) > -threshold) .and. &    !xmin
-                        (extrema(1) - xminmax(2) <  threshold) .and. &    !xmax
-                        (extrema(2) - yminmax(1) > -threshold) .and. &    !ymin
-                        (extrema(2) - yminmax(2) <  threshold)) then      !ymax
-                      call recons_val_cart(fcube(i,j), extrema(1), extrema(2), spherecentroid(:,i,j), &
-                           recons_metrics(:,i,j), recons(:,i,j), extrema_value(5))
-                   endif
-                endif
-                !
-                ! Check all potential minimizer points along element boundaries
-                !
-                if (abs(recons(6,i,j)) > threshold) then
-                   invtmp = 1.0_r8 / (recons(6,i,j) + spherecentroid(2,i,j))
-                   do n=1,2
-                      ! Left edge, intercept with du/dx = 0
-                      extrema(2) = invtmp * (-recons(2,i,j) - 2.0_r8 * recons(4,i,j) * (xminmax(n) - spherecentroid(1,i,j)))
-                      if ((extrema(2) > yminmax(1)-threshold) .and. (extrema(2) < yminmax(2)+threshold)) then
-                         call recons_val_cart(fcube(i,j), xminmax(n), extrema(2), spherecentroid(:,i,j), &
-                              recons_metrics(:,i,j), recons(:,i,j), extrema_value(5+n))
-                      endif
-                   enddo
-                   ! Top/bottom edge, intercept with du/dy = 0
-                   invtmp = 1.0_r8 / recons(6,i,j) + spherecentroid(1,i,j)
-                   do n = 1,2
-                      extrema(1) = invtmp * (-recons(3,i,j) - 2.0_r8 * recons(5,i,j) * (yminmax(n) - spherecentroid(2,i,j)))
-                      if ((extrema(1) > xminmax(1)-threshold) .and. (extrema(1) < xminmax(2)+threshold)) then
-                         call recons_val_cart(fcube(i,j), extrema(1), yminmax(n),spherecentroid(:,i,j), &
-                              recons_metrics(:,i,j), recons(:,i,j), extrema_value(7+n))
-                      endif
-                   enddo
-                endif
-
-                ! Top/bottom edge, y=const., du/dx=0
-                if (abs(recons(4,i,j)) > threshold) then
-                   invtmp = 1.0_r8 / (2.0_r8 * recons(4,i,j))! + spherecentroid(1,i,j)
-                   do n = 1,2
-                      extrema(1) = spherecentroid(1,i,j)+&
-                           invtmp * (-recons(2,i,j) - recons(6,i,j) * (yminmax(n) - spherecentroid(2,i,j)))
-
-                      if ((extrema(1) > xminmax(1)-threshold) .and. (extrema(1) < xminmax(2)+threshold)) then
-                         call recons_val_cart(fcube(i,j), extrema(1), yminmax(n), spherecentroid(:,i,j),&
-                              recons_metrics(:,i,j),recons(:,i,j), extrema_value(9+n))
-                      endif
-                   enddo
-                endif
-                ! Left/right edge, x=const., du/dy=0
-                if (abs(recons(5,i,j)) > threshold) then
-                   invtmp = 1.0_r8 / (2.0_r8 * recons(5,i,j))
-                   do n = 1,2
-                      extrema(2) = spherecentroid(2,i,j)+&
-                           invtmp * (-recons(3,i,j) - recons(6,i,j) * (xminmax(n) - spherecentroid(1,i,j)))
-
-                      if ((extrema(2)>yminmax(1)-threshold) .and. (extrema(2) < yminmax(2)+threshold)) then
-                         call recons_val_cart(fcube(i,j), xminmax(n), extrema(2), spherecentroid(:,i,j), &
-                              recons_metrics(:,i,j), recons(:,i,j), extrema_value(11+n))
-                      endif
+                vx1 = vtx_cart(1,1,i,j);  vy1 = vtx_cart(1,2,i,j)
+                vx2 = vtx_cart(2,1,i,j);  vy2 = vtx_cart(2,2,i,j)
+                vx3 = vtx_cart(3,1,i,j);  vy3 = vtx_cart(3,2,i,j)
+                vx4 = vtx_cart(4,1,i,j);  vy4 = vtx_cart(4,2,i,j)
+                xminmax(1) = min(vx1,vx2,vx3,vx4)
+                xminmax(2) = max(vx1,vx2,vx3,vx4)
+                yminmax(1) = min(vy1,vy2,vy3,vy4)
+                yminmax(2) = max(vy1,vy2,vy3,vy4)
+                scx = spherecentroid(1,i,j)
+                scy = spherecentroid(2,i,j)
+                m1 = recons_metrics(1,i,j)
+                m2 = recons_metrics(2,i,j)
+                m3 = recons_metrics(3,i,j)
+                im1 = i-1; ip1 = i+1
+                jm1 = j-1; jp1 = j+1
+                do itr = 1, ntrac
+                   if (.not. llimiter(itr)) cycle
+                   !rck combined min/max and unrolled inner loop
+                   !minval_patch = MINVAL(fcube(i-1:i+1,j-1:j+1))
+                   !maxval_patch = MAXVAL(fcube(i-1:i+1,j-1:j+1))
+                   minval_patch = min( min( fcube(im1,jm1,k,itr), fcube(i ,jm1,k,itr), fcube(ip1,jm1,k,itr) ), &
+                                       min( fcube(im1,j  ,k,itr), fcube(i ,j  ,k,itr), fcube(ip1,j  ,k,itr) ), &
+                                       min( fcube(im1,jp1,k,itr), fcube(i ,jp1,k,itr), fcube(ip1,jp1,k,itr) ) )
+                   maxval_patch = max( max( fcube(im1,jm1,k,itr), fcube(i ,jm1,k,itr), fcube(ip1,jm1,k,itr) ), &
+                                       max( fcube(im1,j  ,k,itr), fcube(i ,j  ,k,itr), fcube(ip1,j  ,k,itr) ), &
+                                       max( fcube(im1,jp1,k,itr), fcube(i ,jp1,k,itr), fcube(ip1,jp1,k,itr) ) )
+                   min_phi=1.0_r8
+
+                   f0  = fcube(i,j,k,itr)
+                   min_val = f0; max_val = f0!initialize min/max
+                   !
+                   ! compute min/max value at cell corners
+                   !DIR$ SIMD
+                   do vertex=1,4
+                      val = f0
+                      do itmp1=1,irecons-1
+                         val = val + recons(itmp1+1,i,j,itr)*vertex_recons_weights(vertex,itmp1,i,j)
+                      enddo
+                      min_val = min(min_val,val)
+                      max_val = max(max_val,val)
                    enddo
-                endif
-                !rck - combined min/max calculation and unrolled
-                !           max_val = MAXVAL(extrema_value)
-                !           min_val = MINVAL(extrema_value)
-                max_val = extrema_value(13)
-                min_val = extrema_value(13)
-                do itmp1 = 1,12,4
-                   max_val = max(max_val, extrema_value(itmp1),extrema_value(itmp1+1),extrema_value(itmp1+2),extrema_value(itmp1+3))
-                   min_val = min(min_val, extrema_value(itmp1),extrema_value(itmp1+1),extrema_value(itmp1+2),extrema_value(itmp1+3))
-                enddo
-                !rck
-
-                if (max_val>maxval_patch.and.abs(max_val-fcube(i,j))>threshold) then
-                   phi = (maxval_patch-fcube(i,j))/(max_val-fcube(i,j))
-                   if (phi<min_phi) min_phi=phi
-                end if
-                if (min_val<minval_patch.and.abs(min_val-fcube(i,j))>threshold) then
-                   phi = (minval_patch-fcube(i,j))/(min_val-fcube(i,j))
-                   if (phi<min_phi) min_phi=phi
-                end if
-                recons(2:6,i,j)=min_phi*recons(2:6,i,j)
+                   r2 = recons(2,i,j,itr);  r3 = recons(3,i,j,itr)
+                   r4 = recons(4,i,j,itr);  r5 = recons(5,i,j,itr);  r6 = recons(6,i,j,itr)
+                   ! Check if the quadratic is minimized within the element
+                   ! Extrema in the interior of the element (there might be just one candidate)
+                   ! DO NOT NEED ABS here, if disc<0 we have a saddle point (no maximum or minimum)
+                   disc = 4.0_r8 * r4 *r5 - r6*r6
+                   if (abs(disc) > threshold) then
+!not b4b                      ex1 = (r6*r3 - 2.0_r8*r5*r2) / disc + scx
+!not b4b                      ex2 = (r6*r2 - 2.0_r8*r4*r3) / disc + scy
+                      ex1 = (r6*r3 - 2.0_r8*r5*r2)
+                      ex2 = (r6*r2 - 2.0_r8*r4*r3)
+                      disc = 1 /disc
+                      ex1 = ex1*disc+scx
+                      ex2 = ex2*disc+scy
+                     if ( ex1 > xminmax(1)-threshold .and. ex1 < xminmax(2)+threshold .and. &
+                           ex2 > yminmax(1)-threshold .and. ex2 < yminmax(2)+threshold ) then
+                         dx = ex1 - scx; dy = ex2 - scy
+                         v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy)
+                         max_val = max(max_val, v1)
+                         min_val = min(min_val, v1)
+                      end if
+                   endif
+                   !
+                   ! Check all potential minimizer points along element boundaries
+                   !
+                   if (abs(r6) > threshold) then
+                      invtmp = 1.0_r8 / (r6 + scy)
+                      do n=1,2
+                         ! Left edge, intercept with du/dx = 0
+                         ex2 = invtmp * (-r2 - 2.0_r8 * r4 * (xminmax(n) - scx))
+                         if ((ex2 > yminmax(1)-threshold) .and. (ex2 < yminmax(2)+threshold)) then
+                            dx = xminmax(n) - scx; dy = ex2 - scy
+                            v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy)
+                            max_val = max(max_val, v1)
+                            min_val = min(min_val, v1)
+                         endif
+                      enddo
+                      !
+                      ! Top/bottom edge, intercept with du/dy = 0
+                      !
+                      invtmp = 1.0_r8 / r6 + scx
+                      do n = 1,2
+                         ex1 = invtmp * (-r3 - 2.0_r8 * r5 * (yminmax(n) - scy))
+                         if ( (ex1 > xminmax(1)-threshold) .and. (ex1 < xminmax(2)+threshold) ) then
+                            dx = ex1 - scx; dy = yminmax(n) - scy
+                            v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy)
+                            max_val = max(max_val, v1)
+                            min_val = min(min_val, v1)
+                         endif
+                      enddo
+                   endif
+                   !
+                   ! Top/bottom edge, y=const., du/dx=0
+                   !
+                   if (abs(r4) > threshold) then
+                      invtmp = 1.0_r8 / (2.0_r8 * r4)! + spherecentroid(1,i,j)
+                      do n = 1,2
+                         ex1 = scx+invtmp * (-r2 - r6 * (yminmax(n) - scy))
+                         if ((ex1 > xminmax(1)-threshold) .and. (ex1 < xminmax(2)+threshold)) then
+                            dx = ex1 - scx; dy = yminmax(n) - scy
+                            v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy)
+                            max_val = max(max_val, v1)
+                            min_val = min(min_val, v1)
+                         endif
+                      enddo
+                   endif
+                   !
+                   ! Top/bottom edge, y=const., du/dx=0
+                   !
+                   if (abs(r5) > threshold) then
+                      invtmp = 1.0_r8 / (2.0_r8 * r5)! + spherecentroid(1,i,j)
+                      do n = 1,2
+                         ex1 = scy+invtmp * (-r3 - r6 * (xminmax(n) - scx))
+                         if ((ex1 > yminmax(1)-threshold) .and. (ex1 < yminmax(2)+threshold)) then
+                            dx = xminmax(n) - scx; dy = ex1 - scy
+                            v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy)
+                            max_val = max(max_val, v1)
+                            min_val = min(min_val, v1)
+                         endif
+                      enddo
+                   endif
+                   !
+                   if (max_val>maxval_patch.and.abs(max_val-fcube(i,j,k,itr))>threshold) then
+                      phi = (maxval_patch-fcube(i,j,k,itr))/(max_val-fcube(i,j,k,itr))
+                      if (phi<min_phi) min_phi=phi
+                   end if
+                   if (min_val<minval_patch.and.abs(min_val-fcube(i,j,k,itr))>threshold) then
+                      phi = (minval_patch-fcube(i,j,k,itr))/(min_val-fcube(i,j,k,itr))
+                      if (phi<min_phi) min_phi=phi
+                   end if
+                   recons(2:6,i,j,itr)=min_phi*recons(2:6,i,j,itr)
+                end do
              end do
           end do
        end do
@@ -550,8 +567,6 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,irecons,recons,spherecentroid,re
 
   end subroutine slope_limiter
 
-
-
   ! ----------------------------------------------------------------------------------!
   !SUBROUTINE RECONS_VAL_CART-----------------------------------------------CE-for FVM!
   ! AUTHOR: CHRISTOPH ERATH, 30.November 2011                                         !
@@ -587,7 +602,6 @@ subroutine recons_val_cart(fcube, cartx, carty, centroid, pre_computed_metrics,
          recons(5) * (pre_computed_metrics(2) + dy*dy) + &
          recons(6) * (pre_computed_metrics(3) + dx*dy)
   END subroutine recons_val_cart
-
     !DIR$ ATTRIBUTES FORCEINLINE :: recons_val_cart_plm
     subroutine recons_val_cart_plm(fcube, cartx, carty, centroid, recons, value)
     implicit none
@@ -642,7 +656,6 @@ subroutine slopelimiter_val(value, cell_value, local_min, local_max, min_phi)
     endif
   end subroutine slopelimiter_val
   !END SUBROUTINE SLOPELIMITER_VAL------------------------------------------CE-for FVM!
-
   !DIR$ ATTRIBUTES FORCEINLINE :: dotproduct
   function dotproduct(w,f,ns)
     implicit none

From a5eab471abafa2435da0bc68e38bc3ffd8b55169 Mon Sep 17 00:00:00 2001
From: Peter Hjort Lauritzen <pel@ucar.edu>
Date: Mon, 24 Nov 2025 06:48:50 -0700
Subject: [PATCH 7/9] remove buggy code (answer changes!)

---
 .../se/dycore/fvm_reconstruction_mod.F90      | 35 +++----------------
 1 file changed, 4 insertions(+), 31 deletions(-)

diff --git a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90 b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
index fd3dbe4716..8b425042be 100644
--- a/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
+++ b/src/dynamics/se/dycore/fvm_reconstruction_mod.F90
@@ -472,8 +472,6 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,k,nlev,ntrac,irecons,recons,sphe
                    ! DO NOT NEED ABS here, if disc<0 we have a saddle point (no maximum or minimum)
                    disc = 4.0_r8 * r4 *r5 - r6*r6
                    if (abs(disc) > threshold) then
-!not b4b                      ex1 = (r6*r3 - 2.0_r8*r5*r2) / disc + scx
-!not b4b                      ex2 = (r6*r2 - 2.0_r8*r4*r3) / disc + scy
                       ex1 = (r6*r3 - 2.0_r8*r5*r2)
                       ex2 = (r6*r2 - 2.0_r8*r4*r3)
                       disc = 1 /disc
@@ -490,37 +488,12 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,k,nlev,ntrac,irecons,recons,sphe
                    !
                    ! Check all potential minimizer points along element boundaries
                    !
-                   if (abs(r6) > threshold) then
-                      invtmp = 1.0_r8 / (r6 + scy)
-                      do n=1,2
-                         ! Left edge, intercept with du/dx = 0
-                         ex2 = invtmp * (-r2 - 2.0_r8 * r4 * (xminmax(n) - scx))
-                         if ((ex2 > yminmax(1)-threshold) .and. (ex2 < yminmax(2)+threshold)) then
-                            dx = xminmax(n) - scx; dy = ex2 - scy
-                            v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy)
-                            max_val = max(max_val, v1)
-                            min_val = min(min_val, v1)
-                         endif
-                      enddo
-                      !
-                      ! Top/bottom edge, intercept with du/dy = 0
-                      !
-                      invtmp = 1.0_r8 / r6 + scx
-                      do n = 1,2
-                         ex1 = invtmp * (-r3 - 2.0_r8 * r5 * (yminmax(n) - scy))
-                         if ( (ex1 > xminmax(1)-threshold) .and. (ex1 < xminmax(2)+threshold) ) then
-                            dx = ex1 - scx; dy = yminmax(n) - scy
-                            v1 = f0 + r2*dx + r3*dy + r4*(m1+dx*dx) + r5*(m2+dy*dy) + r6*(m3+dx*dy)
-                            max_val = max(max_val, v1)
-                            min_val = min(min_val, v1)
-                         endif
-                      enddo
-                   endif
+
                    !
                    ! Top/bottom edge, y=const., du/dx=0
                    !
                    if (abs(r4) > threshold) then
-                      invtmp = 1.0_r8 / (2.0_r8 * r4)! + spherecentroid(1,i,j)
+                      invtmp = 1.0_r8 / (2.0_r8 * r4)
                       do n = 1,2
                          ex1 = scx+invtmp * (-r2 - r6 * (yminmax(n) - scy))
                          if ((ex1 > xminmax(1)-threshold) .and. (ex1 < xminmax(2)+threshold)) then
@@ -532,10 +505,10 @@ subroutine slope_limiter(nhe,nc,nhc,fcube,jx,jy,k,nlev,ntrac,irecons,recons,sphe
                       enddo
                    endif
                    !
-                   ! Top/bottom edge, y=const., du/dx=0
+                   ! Left/right edge, x=const., du/dy=0
                    !
                    if (abs(r5) > threshold) then
-                      invtmp = 1.0_r8 / (2.0_r8 * r5)! + spherecentroid(1,i,j)
+                      invtmp = 1.0_r8 / (2.0_r8 * r5)
                       do n = 1,2
                          ex1 = scy+invtmp * (-r3 - r6 * (xminmax(n) - scx))
                          if ((ex1 > yminmax(1)-threshold) .and. (ex1 < yminmax(2)+threshold)) then

From 7b690fe3d6985ea9384ae288af491b1d12523e3b Mon Sep 17 00:00:00 2001
From: Jesse Nusbaumer <nusbaume@ucar.edu>
Date: Tue, 25 Nov 2025 09:36:01 -0700
Subject: [PATCH 8/9] Remove dead config code originally used by Eulerian
 dycore.

---
 cime_config/config_pes.xml | 37 -------------------------------------
 1 file changed, 37 deletions(-)

diff --git a/cime_config/config_pes.xml b/cime_config/config_pes.xml
index fe5db44ecf..4d6beab0ba 100644
--- a/cime_config/config_pes.xml
+++ b/cime_config/config_pes.xml
@@ -761,43 +761,6 @@
       </pes>
     </mach>
   </grid>
-  <grid name="a%T31">
-    <mach name="hera|sierra" >
-      <pes pesize="any" compset="any">
-	<comment>none</comment>
-	<ntasks>
-	  <ntasks_atm>64</ntasks_atm>
-	  <ntasks_lnd>64</ntasks_lnd>
-	  <ntasks_rof>64</ntasks_rof>
-	  <ntasks_ice>64</ntasks_ice>
-	  <ntasks_ocn>64</ntasks_ocn>
-	  <ntasks_glc>64</ntasks_glc>
-	  <ntasks_wav>64</ntasks_wav>
-	  <ntasks_cpl>64</ntasks_cpl>
-	</ntasks>
-	<nthrds>
-	  <nthrds_atm>1</nthrds_atm>
-	  <nthrds_lnd>1</nthrds_lnd>
-	  <nthrds_rof>1</nthrds_rof>
-	  <nthrds_ice>1</nthrds_ice>
-	  <nthrds_ocn>1</nthrds_ocn>
-	  <nthrds_glc>1</nthrds_glc>
-	  <nthrds_wav>1</nthrds_wav>
-	  <nthrds_cpl>1</nthrds_cpl>
-	</nthrds>
-	<rootpe>
-	  <rootpe_atm>0</rootpe_atm>
-	  <rootpe_lnd>0</rootpe_lnd>
-	  <rootpe_rof>0</rootpe_rof>
-	  <rootpe_ice>0</rootpe_ice>
-	  <rootpe_ocn>0</rootpe_ocn>
-	  <rootpe_glc>0</rootpe_glc>
-	  <rootpe_wav>0</rootpe_wav>
-	  <rootpe_cpl>0</rootpe_cpl>
-	</rootpe>
-      </pes>
-    </mach>
-  </grid>
   <grid name="a%1.9x2.5">
     <mach name="derecho">
       <pes pesize='any' compset='any'>

From 85a3126ddb9ae2111bf2c40271b89d34ae6717e7 Mon Sep 17 00:00:00 2001
From: Cheryl Craig <cacraig@ucar.edu>
Date: Wed, 26 Nov 2025 11:36:30 -0700
Subject: [PATCH 9/9] Update ChangeLog for cam6_4_131

---
 doc/ChangeLog | 65 ++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 9 deletions(-)

diff --git a/doc/ChangeLog b/doc/ChangeLog
index 5317e0dba9..897856ed6d 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,13 +1,13 @@
 ===============================================================
 
-Tag name: cam6_
-Originator(s): johnmauff, pel
-Date: April 28, 2025
-One-line Summary:
-Github PR URL: https://github.com/ESCOMP/CAM/pull/
+Tag name: cam6_4_131
+Originator(s): johnmauff, pel, cacraig, nusbaume
+Date: Nov 26, 2025
+One-line Summary: Performance improvements for CSLAM
+Github PR URL: https://github.com/ESCOMP/CAM/pull/1365
 
 Purpose of changes (include the issue number and title text for each relevant GitHub issue):
-   Excessive data movement in extend_panel_interpolate (CSLAM): https://github.com/ESCOMP/CAM/issues/1316
+   Excessive data movement in extend_panel_interpolate (CSLAM): https://github.com/ESCOMP/CAM/issues/1360
 
    The subroutine extend_panel_interpolate is written such that the compiler will generate more data movement than is necessary.
    This excessive data movement intensifies a computational load imbalance in the CSLAM advection. While it is impossible to eliminate
@@ -22,7 +22,7 @@ List any changes to the defaults for the boundary datasets: N/A
 
 Describe any substantial timing or memory changes: N/A
 
-Code reviewed by:
+Code reviewed by: nusbaume
 
 List all files eliminated: N/A
 
@@ -30,10 +30,57 @@ List all files added and what they do: N/A
 
 List all existing files that have been modified, and describe the changes:
 
-....
+M       cime_config/config_pes.xml
+          - remove dead config code originally used by Eulerian dycore
+
+M       src/dynamics/se/dycore/fvm_consistent_se_cslam.F90
+M       src/dynamics/se/dycore/fvm_reconstruction_mod.F90
+          - mods for SE dycore as described above
+
+If there were any failures reported from running test_driver.sh on any test
+platform, and checkin with these failures has been OK'd by the gatekeeper,
+then copy the lines from the td.*.status files for the failed tests to the
+appropriate machine below.  All failed tests must be justified.
+
+derecho/intel/aux_cam:
+  SMS_D_Ln9_P1536x1.ne0CONUSne30x8_ne0CONUSne30x8_mt12.FCHIST.derecho_intel.cam-outfrq9s (Overall: FAIL) details:
+      - intermittent failure in CTSM code (lnd_set_decomp_and_domain.F90)
+
+  ERC_D_Ln9.ne30pg2_ne30pg2_mt232.QPC7.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+  ERC_D_Ln9.ne30pg3_ne30pg3_mt232.F1850C_LTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+  ERI_D_Ln18.ne16pg3_ne16pg3_mt232.FHIST_C4.derecho_intel.cam-outfrq3s_eri (Overall: DIFF) details:
+  ERI_D_Ln18.ne30pg3_ne30pg3_mt232.FHISTC_LTso.derecho_intel.cam-outfrq3s_eri (Overall: DIFF) details:
+  ERP_D_Ln9.ne30pg3_ne30pg3_mt232.F1850C_MTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+  ERP_Ld3.ne16pg3_ne16pg3_mg17.FHISTC_WAt1ma.derecho_intel.cam-reduced_hist1d (Overall: DIFF) details:
+  ERP_Ld3.ne30pg3_ne30pg3_mt232.FHISTC_MTt4s.derecho_intel.cam-outfrq1d_aoa (Overall: DIFF) details:
+  ERP_Ln9.ne30pg3_ne30pg3_mg17.FCnudged.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+  ERP_Ln9.ne30pg3_ne30pg3_mg17.FHISTC_WAma.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+  ERR_Ln9.ne16pg3_ne16pg3_mt232.FHISTC_LTso.derecho_intel.cam-outfrq9s_bwic (Overall: DIFF) details:
+  ERS_Ln9.ne30pg3_ne30pg3_mg17.FHISTC_WXma.derecho_intel.cam-outfrq9s_ctem (Overall: DIFF) details:
+  SMS_C2_D_Ln9.ne16pg3_ne16pg3_mg17.FHISTC_WXma.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+  SMS_D_Ln9.ne30pg3_ne30pg3_mt232.FHISTC_MTso.derecho_intel.cam-outfrq9s (Overall: DIFF) details:
+  SMS_D_Ln9_P1280x1.ne30pg3_ne30pg3_mt232.FHISTC_MTt1s.derecho_intel.cam-outfrq9s_Leung_dust (Overall: DIFF) details:
+  SMS_Ld1.ne30pg3_ne30pg3_mg17.FC2010climo.derecho_intel.cam-outfrq1d (Overall: DIFF) details:
+  SMS_Ln9.ne30pg3_ne30pg3_mg17.FW2000climo.derecho_intel.cam-outfrq9s_rrtmgp (Overall: DIFF) details:
+      - answer differences for CSLAM runs
+
+derecho/nvhpc/aux_cam:
+  ERS_Ln9.ne30pg3_ne30pg3_mt232.FHISTC_LTso.derecho_nvhpc.cam-outfrq9s_gpu_default (Overall: FAIL) details:
+     - timing issue - Jian has determined this is due to changes on derecho with the the last upgrade.  He has
+                      reported the issue to CISL.  Note cam6_4_128 is the last CAM tag with baselines to use for comparison
+                      but answer changes are expected starting with cam6_4_130
+
+izumi/nag/aux_cam:
+  ERC_D_Ln27.ne3pg3_ne3pg3_mt232.FKESSLER.izumi_nag.cam-outfrq9s (Overall: DIFF) details:
+  ERC_D_Ln9.ne3pg3_ne3pg3_mt232.FHISTC_LTso.izumi_nag.cam-cosp_rad_diags (Overall: DIFF) details:
+  SMS_D_Ln3.ne5pg3_ne5pg3_mg37.QPX2000.izumi_nag.cam-outfrq3s (Overall: DIFF) details:
+      - answer differences for CSLAM runs
+
+
+izumi/gnu/aux_cam: all BFB
 
 Summarize any changes to answers:
-	No answer changes, all b4b
+	Answer changing, bug not climate changing as reported by pel
 
 ===============================================================