From e5864ff868a6357fd34bceafee60e0979a6c6bbc Mon Sep 17 00:00:00 2001
From: Lucas Esclapez <13371051+esclapez@users.noreply.github.com>
Date: Fri, 28 Apr 2023 11:44:36 -0700
Subject: [PATCH] Add load balancing control (#198)

* Redo load balancing (only).
* Update submodules.
* Describe additional derived vars.
* Add a brief description of the code base.
* A few fixes.
* Add control doc.
* Trailing whitespaces.
* Use LMeX version of the diagnostics only if requested.
* Add a couple of details to address Bruce's comments.
---
 CMakeLists.txt | 1 +
 Docs/source/manual/Implementation.rst | 197 ++++++++++++++++
 Docs/source/manual/LMeXControls.rst | 31 ++-
 Docs/source/manual/Model.rst | 9 +-
 Docs/source/manual/index.rst | 1 +
 Source/CMakeLists.txt | 4 +-
 Source/PeleLM.H | 135 ++++++++---
 Source/PeleLMBC.cpp | 52 +++++
 Source/PeleLMDeriveFunc.H | 5 +
 Source/PeleLMDeriveFunc.cpp | 19 ++
 Source/PeleLMInit.cpp | 6 +
 Source/PeleLMReactions.cpp | 3 +-
 Source/PeleLMRegrid.cpp | 317 +++++++++++++++++++++++++-
 Source/PeleLMSetup.cpp | 29 ++-
 Source/PeleLMUserKeys.H | 168 ++++++++++++++
 Source/PeleLMUtils.cpp | 94 +++++++-
 Submodules/AMReX-Hydro | 2 +-
 Submodules/PelePhysics | 2 +-
 Submodules/amrex | 2 +-
 19 files changed, 1027 insertions(+), 50 deletions(-)
 create mode 100644 Docs/source/manual/Implementation.rst
 create mode 100644 Source/PeleLMUserKeys.H

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 57ddcd5c..a67215ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,7 @@ option(PELELMEX_USE_EB "Enable Embedded Boundary" OFF)
 option(PELELMEX_USE_EFIELD "Enable Electric field module" OFF)
 option(PELELMEX_USE_PARTICLES "Enable Spray module" OFF)
 option(PELELMEX_USE_SOOT "Enable Soot module" OFF)
+option(PELELMEX_USE_DIAG "Enable LMeX Diagn module" OFF)
 
 #
 # Misc options
diff --git a/Docs/source/manual/Implementation.rst b/Docs/source/manual/Implementation.rst
new file mode 100644
index 00000000..760c1ea5
--- /dev/null
+++ b/Docs/source/manual/Implementation.rst
@@ -0,0 +1,197 @@
+.. highlight:: rst
+
+.. _sec:code:
+
+Source code
+===========
+
+The following provides an overview of the *PeleLMeX* source code and basic information on the data structures, and is
+useful for any user intending to use and/or do some development in the code.
+
+Overview of source code
+-----------------------
+
+*PeleLMeX* is based upon AMReX's `AmrCore `_ from which it inherits
+an AMR hierarchy data structure and basic regridding functionalities. The code is entirely written in C++, with low-level
+compute-intensive kernels implemented as lambda functions to seamlessly run on CPUs and various GPU backends through the AMReX
+high-performance portability abstraction.
+
+The core of the algorithm is implemented in the ``advance()`` function, which acts on all the levels concurrently.
+Projection operators and advection scheme functions are imported from the `AMReX-Hydro library `_,
+while the core of the thermo-chemistry functionalities comes from `PelePhysics `_.
+Users are responsible for providing initial and boundary conditions in the local subfolder implementing their case, i.e. it is
+not possible to compile and run *PeleLMeX* without actually writing a few lines of code. However, numerous examples are provided
+in ``Exec/RegTests`` from which new users can draw to set up their new case.
+
+The source code contains a few dozen files, organized around the pieces of the algorithm and major functionalities:
+
+* ``PeleLMEvolve``: top level time advance loop, with IO/exit controls
+* ``PeleLMSetup``: setting up the simulation parameters, parsing the input file
+* ``PeleLMInit``: generating the initial solution from scratch or a checkpoint file, performing initial projections/iteration(s)
+* ``PeleLMAdvance``: top level implementation of the time step algorithm
+* ``PeleLMProjection``: implement the various flavors of the nodal projection
+* ``PeleLMUmac``: implement the construction and projection of MAC-velocities
+* ``PeleLMAdvection``: functions to compute the explicit advection terms
+* ``PeleLMDiffusion``: functions to compute the diffusion terms, using the operators defined in ``DiffusionOp``
+* ``PeleLMReaction``: function using *PelePhysics* reactors to integrate the chemistry and linearized advection/diffusion
+* ``PeleLMPlot``: implementation of plotfile and checkpoint file IOs
+* ``PeleLMBC``: functions filling the ghost cells (at fine/fine, coarse/fine and domain boundaries)
+* ``PeleLMRegrid``: creating new AMR levels or remaking modified AMR levels during adaptive refinement
+* ``PeleLMTagging``: mark cells for refinement
+* ``PeleLM_K.H``: low-level kernel functions
+
+Data structure and containers
+-----------------------------
+
+AMReX 101
+^^^^^^^^^
+
+The basic AMReX data structure is the `MultiFab `_
+(historically, a multi Fortran Array Box (FAB)).
+Within the block-structured AMR approach of AMReX, the domain is decomposed into non-overlapping rectangular `boxes`,
+which can be assembled into a `boxArray`. Each AMR level has a `boxArray` providing the list of `boxes` of that level.
+The `boxes` are distributed across the MPI ranks, the mapping of which is described by a `DistributionMap`. Given a
+`boxArray` and a `DistributionMap`, one can define an actual data container (`boxes` are only lightweight descriptors
+of the geometrical rectangular objects, containing bounds and centering information), where each rank will
+allocate a FAB for the boxes it owns in the `boxArray`, resulting in a collection of FABs, or a MultiFab, distributed
+across the MPI ranks.
+
+To access the data in a MultiFab, one uses an `MFIter `_
+(or MultiFab iterator), which provides each MPI rank access to the FABs it owns within the MultiFab. Actual access to the data in
+memory is then provided by the lightweight `Array4` structure, and it is strongly advised to rely on the AMReX
+`ParallelFor `_ function template to loop through the logical `i,j,k` indexes.
+For example, to set the velocity data stored in a MultiFab called `NewState` with data from an `OldState` and an increment
+from a third MultiFab `advTerm`: ::
+
+    for (MFIter mfi(NewState,TilingIfNotGPU()); mfi.isValid(); ++mfi) {
+        Box const& bx = mfi.tilebox();
+        auto const& velo_old = OldState.const_array(mfi,VELOCITY_INDEX);
+        auto const& incr = advTerm.const_array(mfi);
+        auto const& velo_new = NewState.array(mfi,VELOCITY_INDEX);
+        ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+        {
+            velo_new(i,j,k) = velo_old(i,j,k) + incr(i,j,k);
+        });
+    }
+
+.. note::
+    For the example above to work, all three MultiFabs must share the same `boxArray` and `DistributionMap`.
+
+Users are strongly encouraged to review the content of the `AMReX documentation `_
+to become more familiar with AMReX data structures and environment.
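+
+As a complementary sketch of the relation between these objects (a generic AMReX example, not a *PeleLMeX*
+function; the box sizes are purely illustrative), one can build a `boxArray` covering an index space, chop it
+into smaller `boxes`, distribute them over the MPI ranks and allocate a MultiFab on the resulting layout: ::
+
+    // Minimal sketch, assuming the usual AMReX headers and the amrex namespace
+    Box domain(IntVect(AMREX_D_DECL(0,0,0)), IntVect(AMREX_D_DECL(127,127,127)));
+    BoxArray ba(domain);            // a single box covering the whole index space
+    ba.maxSize(32);                 // chop it into boxes of at most 32 cells per side
+    DistributionMapping dm(ba);     // map each box to an MPI rank
+    MultiFab mf(ba, dm, 1, 0);      // one component, no ghost cells
+    mf.setVal(0.0);                 // each rank only touches the FABs it owns
+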
+*PeleLMeX* state and advance containers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The state vector of *PeleLMeX* contains the 2 or 3 components of velocity, the mixture density, species density (rhoYs),
+rhoH, temperature and the thermodynamic pressure. The state components are stored in a cell-centered MultiFab with
+`NVAR` components. Additionally, the perturbational pressure stored at the nodes is contained in a separate MultiFab.
+Together with the cell-centered pressure gradient, the cell-centered divergence constraint and cell-centered
+transport properties, these MultiFabs are assembled into a `LevelData` struct.
+
+Each level in the AMR hierarchy has two versions of the `LevelData` at any point during the simulation: one
+for the old state and one for the new state. The developer can get a pointer to the `LevelData` struct by
+calling: ::
+
+    auto ldata_p = getLevelDataPtr(lev,AmrOldTime);
+
+with either `AmrOldTime` or `AmrNewTime` on level `lev`. Additionally, calling this function with
+`AmrHalfTime` will return a `LevelData` struct whose `state` is linearly interpolated between the old and new
+states (but the other MultiFabs in `LevelData` are empty!).
+It is also often useful to have access to a vector of a state component across the entire AMR hierarchy. To do so, *PeleLMeX*
+provides a set of functions returning a vector of MultiFab `std::unique_ptr` aliased into the `LevelData`
+MultiFab on each level: ::
+
+    getStateVect(time);         // Return the entire state (ncomp: NVAR)
+    getVelocityVect(time);      // Return the velocity only (ncomp: AMREX_SPACEDIM)
+    getDensityVect(time);       // Return the mixture density (ncomp: 1)
+    getSpeciesVect(time);       // Return the species density (ncomp: NUM_SPECIES)
+    getRhoHVect(time);          // Return rhoH (ncomp: 1)
+    getTempVect(time);          // Return temperature (ncomp: 1)
+    getDivUVect(time);          // Return divergence constraint (ncomp: 1)
+    getDiffusivityVect(time);   // Return diffusivity (ncomp: NUM_SPECIES+2)
+    getViscosityVect(time);     // Return viscosity (ncomp: 1)
+
+where ``time`` can either be `AmrOldTime` or `AmrNewTime`.
+Also available at any point during the simulation is the `LevelDataReact` which contains the species
+chemical source terms. A single version of the container is available on each level and can be accessed
+using: ::
+
+    auto ldataR_p = getLevelDataReactPtr(lev);
+
+Within the time-advance function, the *PeleLMeX* algorithm calls for the computation of the advection,
+diffusion and reaction source terms iteratively using SDC. At each step, the results of other steps
+can be used as part of the numerical scheme (e.g. the explicit advection with a Godunov scheme uses
+the diffusion term). These temporary variables, only useful in the scope of the advance function, are
+assembled into two structs: ``AdvanceDiffData`` and ``AdvanceAdvData``. The former contains three
+MultiFabs for the separate diffusion term evaluations described in :numref:`LMeX_Algo`: :math:`D^n`,
+:math:`D^{n+1,k}` and :math:`D^{n+1,k+1}`, as well as additional containers for the :math:`\overline{W}`
+and Soret contributions. The latter encapsulates the face-centered MAC velocities :math:`U_{ADV}`, the
+advection term :math:`A^{n+1/2,(k+1)}`, the pressure correction :math:`\chi` and a forcing container
+used in the RHS of the advection/diffusion/reaction solves. In contrast with the `LevelData`, these two containers
+are freed at the end of the advance function, and are passed around in the functions called within `advance()`.
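+
+As an illustration of these accessors (a minimal sketch, assuming it is placed inside a *PeleLM* member
+function so that `finest_level`, the `TimeStamp` values and the getters above are all in scope), one can
+report the maximum temperature of the new-time state on each level of the AMR hierarchy: ::
+
+    // Sketch: loop over the AMR hierarchy using the aliased temperature MultiFabs
+    auto tempVec = getTempVect(AmrNewTime);   // one MultiFab per level, aliased into LevelData
+    for (int lev = 0; lev <= finest_level; ++lev) {
+        amrex::Print() << "Level " << lev
+                       << " max T: " << tempVec[lev]->max(0) << "\n";
+    }
+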
+Parallelism
+-----------
+
+*PeleLMeX* inherits the MPI+X approach from the AMReX library, where X can be any of CUDA, HIP or SYCL
+for heterogeneous architectures (OpenMP is currently only available for non-reacting/explicit chemistry integrators).
+The reader is referred to the `AMReX GPU documentation `_ for more details on
+the thread parallelism.
+
+As mentioned above, the top-level spatial decomposition arises from AMReX's block-structured approach. On each level, non-overlapping
+`boxes` are assembled into a `boxArray` and distributed across the MPI ranks with a `DistributionMap` (or `DMap`).
+It is in our best interest to ensure that all the MultiFabs in the code use the same `boxArray` and `DMap`,
+such that operations using `MFIter` can be performed and data copies across MPI ranks are minimized.
+However, it is also important to maintain a good load balance, i.e. ensure that each MPI rank has the same amount
+of work, to avoid wasting computational resources. Reactive flow simulations are challenging because the chemistry
+integration is very spatially heterogeneous, with stiff ODE integration required within the flame front and non-stiff
+integration of the linearized advection/diffusion required in the cold gases or burnt mixture. Additionally, because
+a non-subcycling approach is used in *PeleLMeX*, the chemistry doesn't have to be integrated in fine-covered regions.
+Two `boxArray` and associated `DMap` are thus available in *PeleLMeX*:
+
+1. The first one is inherited from `AmrCore` and is available as ``grids[lev]`` (`boxArray`) and ``dmap[lev]`` (`DMap`) throughout the code. Most
+   of the *PeleLMeX* MultiFabs use these two, and the `boxes` sizes are dictated by the `amr.max_grid_size` and `amr.blocking_factor` keys in the input
+   file. These are employed for all the operations in the code except the chemistry integration. The default load balancing approach is to use
+   a space-filling curve (SFC) with each box weighted by its number of cells. Advanced users can try alternate approaches using the
+   keys listed in :doc:`LMeXControls`.
+2. A second one is created, masking fine-covered regions, and updated during regrid operations. It is used to perform the chemistry integration,
+   and because this is a purely local integration (in contrast with the implicit diffusion solve for instance, which requires communications
+   to solve the linear problem using GMG), a Knapsack load balancing approach is used by default, where the weight of each box is based
+   on the total number of chemistry RHS calls in the box. The size of the `boxes` in the chemistry `boxArray` (accessible with ``m_baChem[lev]``)
+   is controlled by the `peleLM.max_grid_size_chem` key in the input file. Once again, advanced users can try alternate approaches to load
+   balancing the chemistry `DMap` using the keys described in :doc:`LMeXControls`.
+
+After each regrid operation, even if the grids did not actually change, *PeleLMeX* will try to find a better load balance for the
+`AmrCore` `DMap`. Because changing the load balance requires copying data across MPI ranks, we only want to change the `DMap`
+if a significantly better one can be obtained, with the threshold for a better `DMap` defined based on the value of
+`peleLM.load_balancing_efficiency_threshold`.
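+
+For reference, a minimal input-file excerpt activating the load balancing described above could read as follows
+(the values shown are only illustrative; the complete list of keys and valid values is given in :doc:`LMeXControls`): ::
+
+    peleLM.do_load_balancing = 1                                  # activate load balancing
+    peleLM.load_balancing_method = sfc                            # AmrCore dmap: space-filling curve
+    peleLM.load_balancing_cost_estimate = ncell                   # AmrCore dmap: cost = number of cells per box
+    peleLM.chem_load_balancing_method = knapsack                  # chemistry dmap: knapsack
+    peleLM.chem_load_balancing_cost_estimate = chemfunctcall_sum  # chemistry dmap: cost = chemistry RHS calls per box
+    peleLM.load_balancing_efficiency_threshold = 1.05             # only remap if the new dmap is 5% more efficient
+    peleLM.max_grid_size_chem = 32                                # max box size of the chemistry boxArray
+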
+Debugging
+---------
+
+The first step to debug any addition or undefined behavior of *PeleLMeX* is to turn the ``DEBUG`` flag ``ON`` in the
+GNUmakefile and activate AMReX's floating point exception traps in the input file: ::
+
+    amrex.fpe_trap_invalid = 1
+    amrex.fpe_trap_zero = 1
+    amrex.fpe_trap_overflow = 1
+
+This will slow down the code considerably, but will enable bound checks on all AMReX low-level data structures,
+catch floating point errors (use of NaNs, division by zero, ...) and trigger any ``AMREX_ASSERT`` statement added to the
+code base. It is also often useful to visualize data in order to understand the erroneous results the solver can
+return. Developers can write a single MultiFab to disk using AMReX `VisMF`: ::
+
+    VisMF::Write(myMF,"VisMyMF");
+
+which can be visualized with `Amrvis` using `amrvis -mf`. Alternatively, visualizing the entire AMR hierarchy is also
+useful. *PeleLMeX* provides a simple function to write a vector of MultiFabs: ::
+
+    WriteDebugPlotFile(GetVecOfConstPtrs(getTempVect()),"TempDebug");
+
+which can be opened with `Amrvis` or any other visualization software. This last function works provided that
+the MultiFabs in the vector all have the same number of components.
+Finally, another way of checking individual pieces of the algorithm is to use the *PeleLMeX* evaluate mode ``peleLM.run_mode=evaluate``
+and specify a list of fields with ``peleLM.evaluate_vars`` as described in :doc:`LMeXControls`. Note that not all of the
+algorithm is available in this mode yet.
diff --git a/Docs/source/manual/LMeXControls.rst b/Docs/source/manual/LMeXControls.rst
index 8eca64cf..791b590a 100644
--- a/Docs/source/manual/LMeXControls.rst
+++ b/Docs/source/manual/LMeXControls.rst
@@ -52,6 +52,27 @@ Grid/AMR parameters
 
     amr.blocking_factor = 16       # block factor in grid generation (min box size)
     amr.max_grid_size = 64         # max box size
+    peleLM.max_grid_size_chem = 32 # [OPT, DEF="None"] Max box size for the Chemistry BoxArray
+
+Load balancing
+--------------
+
+*PeleLMeX* relies on two distribution maps (see :doc:`Implementation` for more details).
+
+::
+
+    peleLM.do_load_balancing = 1                       # [OPT, DEF=0] Activate load balancing
+    peleLM.load_balancing_method = sfc                 # [OPT, DEF="sfc"] AmrCore dmap load balancing method
+    peleLM.load_balancing_cost_estimate = ncell        # [OPT, DEF="ncell"] AmrCore dmap balancing cost
+    peleLM.chem_load_balancing_method = knapsack       # [OPT, DEF="knapsack"] Chemistry dmap load balancing method
+    peleLM.chem_load_balancing_cost_estimate = chemfunctcall_sum # [OPT, DEF="chemfunctcall_sum"] Chemistry dmap balancing cost
+    peleLM.load_balancing_efficiency_threshold = 1.05  # What constitutes a better dmap?
+
+The balancing method can be one of `sfc`, `roundrobin` or `knapsack`, while the cost estimate can be one of
+`ncell`, `chemfunctcall_avg`, `chemfunctcall_max`, `chemfunctcall_sum`, `userdefined_avg` or `userdefined_sum`. When
+using either of the last two options, the user must provide a definition of the `derUserDefined` derived variable. If multiple components
+are defined in the `derUserDefined` function, the first one is used for load balancing.
+
 Time stepping parameters
 ------------------------
 
@@ -125,7 +146,6 @@ overview of the available controls:
 The `field_name` can be any of the state or derived variables (see below) component.
 Additional controls specific to embedded boundaries are discussed below.
-
 
 PeleLMeX derived variables
 --------------------------
 
@@ -186,8 +206,17 @@ The following list of derived variables are available in PeleLMeX:
    * - `coordinates`
      - AMREX_SPACEDIM
      - Cell-center coordinates
+   * - `DistributionMap`
+     - 1
+     - The MPI rank of each box
+   * - `derUserDefined`
+     - ?
+     - A user-defined derived variable whose number of components is provided by the user (see below).
 
 Note that `mixture_fraction` and `progress_variable` requires additional inputs from the users as described below.
 
+The `derUserDefined` option allows the user to define their own derived variable, which can comprise several components. To do
+so, the user needs to copy the Source/DeriveUserDefined.cpp file into their run folder and update it. The number of
+components is defined based on the size of the vector returned by pelelm_setuserderives().
 
 PeleLMeX algorithm
 ------------------
diff --git a/Docs/source/manual/Model.rst b/Docs/source/manual/Model.rst
index 19bc01ed..09a3813a 100644
--- a/Docs/source/manual/Model.rst
+++ b/Docs/source/manual/Model.rst
@@ -40,7 +40,7 @@ In a nutshell, `PeleLMeX` features include:
 Mathematical background
 -----------------------
 
-`PeleLMeX` evolves chemically reacting low Mach number flows with block-structured adaptive mesh refinement (AMR). The code depends upon the `AMReX `_ library to provide the underlying data structures, and tools to manage and operate on them across massively parallel computing architectures. `PeleLMeX` also utilizes the source code and algorithmic infrastructure of `AMReX-Hydro `_. `PeleLMeX` borrows heavily from `PeleLM `_. The core algorithms in `PeleLM` are described in the following papers:
+`PeleLMeX` evolves chemically reacting low Mach number flows with block-structured adaptive mesh refinement (AMR). The code depends upon the `AMReX `_ library to provide the underlying data structures, and tools to manage and operate on them across massively parallel computing architectures. `PeleLMeX` also utilizes the source code and algorithmic infrastructure of `AMReX-Hydro `_. `PeleLMeX` borrows heavily from `PeleLM`_. The core algorithms in `PeleLM` are described in the following papers:
 
 * *A conservative, thermodynamically consistent numerical approach for low Mach number combustion. I. Single-level integration*, A. Nonaka, J. B. Bell, and M. S. Day, *Combust. Theor. Model.*, **22** (1) 156-184 (2018)
 
@@ -172,12 +172,15 @@ and simplified velocity constraint,
 PeleLMeX Algorithm
 ------------------
 
-An overview of `PeleLMeX` time-advance function is provided in the figure below and details are provided in the following subsections.
+An overview of the `PeleLMeX` time-advance function is provided in :numref:`LMeX_Algo` and details are provided in the following subsections.
 
 .. figure:: images/model/PeleLMeX_Algorithm.png
+   :name: LMeX_Algo
    :align: center
   :figwidth: 50%
 
+   Flowchart of the *PeleLMeX* advance function.
+
    The three steps of the low Mach number projection scheme described :ref:`below ` are referenced to better
    emphasize how the thermodynamic solve is closely weaved into the fractional step appraoch. Striped boxes indicate
    where the :ref:`Godunov procedure ` is employed while the four different linear solves are highlighted.
 
@@ -340,7 +343,7 @@ Note that in the presence of EB, only the `Godunov_PLM` variant is available.
 AMR extension
 ^^^^^^^^^^^^^
 
-In contrast with `PeleLM `_, `PeleLMeX` do not rely a on subcycling appraoch to advance the AMR hierarchy.
+In contrast with `PeleLM`_, `PeleLMeX` do not rely a on subcycling appraoch to advance the AMR hierarchy. This difference is illustrated in the figure below comparing the multi-level time-stepping approach in both codes: .. figure:: images/model/PeleLMeX_Subcycling.png diff --git a/Docs/source/manual/index.rst b/Docs/source/manual/index.rst index 5329542a..271bf139 100644 --- a/Docs/source/manual/index.rst +++ b/Docs/source/manual/index.rst @@ -22,6 +22,7 @@ point your web browser at the file ``${PELELMEX_HOME}/Docs/build/html/index.html :caption: Theory: Model.rst + Implementation.rst Validation.rst Performances.rst diff --git a/Source/CMakeLists.txt b/Source/CMakeLists.txt index 2ab506fa..c144a5eb 100644 --- a/Source/CMakeLists.txt +++ b/Source/CMakeLists.txt @@ -42,7 +42,9 @@ target_sources(pelelmex Utils.cpp ) -add_subdirectory(Diagnostics) +if (PELELMEX_USE_DIAGS) + add_subdirectory(Diagnostics) +endif () if (PELELMEX_USE_EFIELD) add_subdirectory(Efield) diff --git a/Source/PeleLM.H b/Source/PeleLM.H index 96a96bcd..82e359b1 100644 --- a/Source/PeleLM.H +++ b/Source/PeleLM.H @@ -1,29 +1,36 @@ #ifndef _PeleLM_H_ #define _PeleLM_H_ +// PeleLMeX includes +#include "PeleLMUserKeys.H" +#include "PeleLM_Index.H" +#include "PeleLMDerive.H" +#include "pelelm_prob_parm.H" +#include "DiffusionOp.H" +#include "DiagBase.H" +#ifdef PELE_USE_EFIELD +#include "PrecondOp.H" +#endif + +// PelePhysics lib +#include +#include +#include +#include +#include + +// AMReX-Hydro lib +#include +#include +#include + +// AMReX lib #include #include #include -#include -#include #include #include -#include -#include -#include "PelePhysics.H" -#include "PMFData.H" -#include "turbinflow.H" -#include "ReactorBase.H" -#include -#include -#include -#include -#include - -#ifdef PELE_USE_EFIELD -#include -#endif // Forward declarations #ifdef PELELM_USE_SPRAY @@ -45,7 +52,7 @@ class PeleLM : public amrex::AmrCore { // constructor PeleLM(); - //destructor + // destructor virtual ~PeleLM(); // Setup function @@ -105,8 +112,8 @@ class PeleLM : public amrex::AmrCore { void readProbParm(); // ReadGridFile - void readGridFile(std::string grid_file, - amrex::Vector& input_ba); + void readGridFile(std::string grid_file, + amrex::Vector& input_ba); #ifdef PELELM_USE_SPRAY // ReadSprayParameters @@ -249,6 +256,10 @@ class PeleLM : public amrex::AmrCore { // checkRunParams void checkRunParams(); + void computeCosts(int a_lev); + void computeCosts(int a_lev, amrex::LayoutData &a_costs, + int a_costMethod); + //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- @@ -617,6 +628,7 @@ class PeleLM : public amrex::AmrCore { void fillpatch_gradp(int lev, amrex::Real a_time, amrex::MultiFab& a_gp, int nGhost); void fillpatch_reaction(int lev, amrex::Real a_time, amrex::MultiFab& a_I_R, int nGhost); void fillpatch_forces(amrex::Real a_time, amrex::Vector const &a_velForces, int nGrowForce); + void fillpatch_chemFunctCall(int lev, amrex::Real a_time, amrex::MultiFab& a_fctC, int nGhost); #ifdef PELE_USE_EFIELD void fillpatch_phiV(int lev, amrex::Real a_time, amrex::MultiFab& a_phiV, int phiV_comp, int nGhost); void fillPatchExtrap(amrex::Real a_time, amrex::Vector const &a_MF, int a_nGrow); @@ -629,6 +641,7 @@ class PeleLM : public amrex::AmrCore { void fillcoarsepatch_divu(int lev, amrex::Real a_time, amrex::MultiFab& a_divu, int nGhost); void fillcoarsepatch_gradp(int lev, amrex::Real a_time, amrex::MultiFab& a_gp, int nGhost); void 
fillcoarsepatch_reaction(int lev, amrex::Real a_time, amrex::MultiFab& a_I_R, int nGhost); + void fillcoarsepatch_chemFunctCall(int lev, amrex::Real a_time, amrex::MultiFab& a_fctC, int nGhost); // Fill physical boundaries void setInflowBoundaryVel (amrex::MultiFab &a_vel, int lev, PeleLM::TimeStamp a_time); @@ -649,6 +662,50 @@ class PeleLM : public amrex::AmrCore { //----------------------------------------------------------------------------- // UTILS + template + void parseUserKey(amrex::ParmParse &a_pp, + const std::string &a_search_key, + const K &keyT, + int &key, + int keyIndex = -1) + { + std::string userkey = "default"; + if (keyIndex < 0 ) { + a_pp.query(a_search_key.c_str(), userkey); + } else { + int search_key_size = a_pp.countval(a_search_key.c_str()); + if (search_key_size > 0) { + AMREX_ASSERT(keyIndex < search_key_size); + amrex::Vector userkeyVec(search_key_size); + a_pp.getarr(a_search_key.c_str(),userkeyVec,keyIndex,1); + userkey = userkeyVec[keyIndex]; + } + } + std::transform(userkey.begin(), userkey.end(), userkey.begin(), ::tolower); + if (std::find(std::begin(keyT.searchKey), + std::end(keyT.searchKey), a_search_key) + == std::end(keyT.searchKey)) { + std::string errmsg = "Improper use of parseUserKey: search_key incompatible with provided userKey. \n"; + errmsg += "The valid search keys are:\n"; + for ( const auto &validKey : keyT.searchKey ) { + errmsg += " - " + validKey + " \n"; + } + amrex::Abort(errmsg); + } + + if (keyT.str2int.count(userkey) == 0) { + std::string errmsg = "Invalid key for search_key " + a_search_key + ". \n"; + errmsg += "Valid keys are:\n"; + for ( const auto &valid_pair : keyT.str2int ) { + if (valid_pair.first != "default"){ + errmsg += " - " + valid_pair.first + " \n"; + } + } + amrex::Abort(errmsg); + } + + key = keyT.str2int.at(userkey); + }; // flux divergence void fluxDivergence(const amrex::Vector &a_divergence, @@ -724,6 +781,9 @@ class PeleLM : public amrex::AmrCore { void floorSpecies(const PeleLM::TimeStamp &a_time); + void loadBalanceChem(); + void loadBalanceChemLev(int a_lev); + amrex::Real MLNorm0(const amrex::Vector &a_MF); amrex::Vector MLNorm0(const amrex::Vector &a_MF, @@ -1099,12 +1159,12 @@ class PeleLM : public amrex::AmrCore { amrex::Vector m_regrid_ba; // Problem parameters - ProbParm* prob_parm; - ProbParm* prob_parm_d; + ProbParm* prob_parm = nullptr; + ProbParm* prob_parm_d = nullptr; #ifdef PELELM_USE_SOOT // Soot parameters - SootModel* soot_model; + SootModel* soot_model = nullptr; bool do_soot_solve; #endif @@ -1126,14 +1186,6 @@ class PeleLM : public amrex::AmrCore { amrex::Vector> m_diagnostics; amrex::Vector m_diagVars; - // Linear solvers - std::unique_ptr m_diffusion_op; - std::unique_ptr m_mcdiffusion_op; - std::unique_ptr m_diffusionTensor_op; - std::unique_ptr macproj; - int m_macProjNeedReset = 0; - int m_macProjOldSize = 0; - int m_verbose = 0; // IO options @@ -1235,7 +1287,18 @@ class PeleLM : public amrex::AmrCore { // Misc. 
int m_floor_species = 0; - int m_checkMem = 0; + + // Performances + int m_checkMem {0}; + int m_doLoadBalance {0}; + int m_loadBalanceCost {LoadBalanceCost::Ncell}; + int m_loadBalanceMethod {LoadBalanceMethod::SFC}; + int m_loadBalanceCostChem {LoadBalanceCost::ChemFunctCallMax}; + int m_loadBalanceMethodChem {LoadBalanceMethod::Knapsack}; + amrex::Real m_loadBalanceKSfactor {1.2}; + amrex::Real m_loadBalanceEffRatioThreshold {1.1}; + amrex::Vector > > m_costs; + amrex::Vector m_loadBalanceEff; // SDC int m_nSDCmax = 1; @@ -1312,7 +1375,7 @@ class PeleLM : public amrex::AmrCore { amrex::Vector typical_values; #ifdef AMREX_USE_EB - // EB refinement + // EB std::string m_EB_refine_type = "Static"; int m_EB_refine_LevMax = -1; int m_EB_refine_LevMin = -1; @@ -1353,6 +1416,12 @@ class PeleLM : public amrex::AmrCore { //----------------------------------------------------------------------------- // Linear Solvers + std::unique_ptr m_diffusion_op; + std::unique_ptr m_mcdiffusion_op; + std::unique_ptr m_diffusionTensor_op; + std::unique_ptr macproj; + int m_macProjNeedReset {0}; + int m_macProjOldSize {0}; // Nodal projection int m_nodal_mg_max_coarsening_level = 100; diff --git a/Source/PeleLMBC.cpp b/Source/PeleLMBC.cpp index 05a56033..a2df51a2 100644 --- a/Source/PeleLMBC.cpp +++ b/Source/PeleLMBC.cpp @@ -638,6 +638,36 @@ void PeleLM::fillpatch_reaction(int lev, } } +// Fill functC +void PeleLM::fillpatch_chemFunctCall(int lev, + const amrex::Real a_time, + amrex::MultiFab &a_fctC, + int nGhost) { + ProbParm const* lprobparm = prob_parm_d; + if (lev == 0) { + PhysBCFunct> bndry_func(geom[lev], {m_bcrec_force}, + PeleLMCCFillExtDirDummy{lprobparm, m_nAux}); + FillPatchSingleLevel(a_fctC, IntVect(nGhost), a_time, + {&(m_leveldatareact[lev]->functC)},{a_time}, + 0, 0, 1, geom[lev], bndry_func, 0); + } else { + + // Interpolator + auto* mapper = getInterpolator(); + + PhysBCFunct> crse_bndry_func(geom[lev-1], {m_bcrec_force}, + PeleLMCCFillExtDirDummy{lprobparm, m_nAux}); + PhysBCFunct> fine_bndry_func(geom[lev], {m_bcrec_force}, + PeleLMCCFillExtDirDummy{lprobparm, m_nAux}); + FillPatchTwoLevels(a_fctC, IntVect(nGhost), a_time, + {&(m_leveldatareact[lev-1]->functC)},{a_time}, + {&(m_leveldatareact[lev]->functC)},{a_time}, + 0, 0, 1, geom[lev-1], geom[lev], + crse_bndry_func, 0, fine_bndry_func, 0, + refRatio(lev-1), mapper, {m_bcrec_force}, 0); + } +} + // Fill the state void PeleLM::fillcoarsepatch_state(int lev, const amrex::Real a_time, @@ -730,6 +760,28 @@ void PeleLM::fillcoarsepatch_reaction(int lev, refRatio(lev-1), mapper, {m_bcrec_force}, 0); } +// Fill coarse patch of chem function call +void PeleLM::fillcoarsepatch_chemFunctCall(int lev, + const amrex::Real a_time, + amrex::MultiFab &a_fctC, + int nGhost) { + ProbParm const* lprobparm = prob_parm_d; + + // Interpolator + auto* mapper = getInterpolator(m_regrid_interp_method); + + PhysBCFunct> crse_bndry_func(geom[lev-1], {m_bcrec_force}, + PeleLMCCFillExtDirDummy{lprobparm, m_nAux}); + PhysBCFunct> fine_bndry_func(geom[lev], {m_bcrec_force}, + PeleLMCCFillExtDirDummy{lprobparm, m_nAux}); + InterpFromCoarseLevel(a_fctC, IntVect(nGhost), a_time, + m_leveldatareact[lev-1]->functC, 0, 0, 1, + geom[lev-1], geom[lev], + crse_bndry_func, 0, fine_bndry_func, 0, + refRatio(lev-1), mapper, {m_bcrec_force}, 0); +} + + // Fill the inflow boundary of a velocity MF // used for velocity projection void PeleLM::setInflowBoundaryVel(MultiFab &a_vel, diff --git a/Source/PeleLMDeriveFunc.H b/Source/PeleLMDeriveFunc.H index d763c07e..ebbfd4c6 100644 
--- a/Source/PeleLMDeriveFunc.H +++ b/Source/PeleLMDeriveFunc.H @@ -98,6 +98,11 @@ void pelelm_derlambda (PeleLM* a_pelelm, const amrex::Box& bx, amrex::FArrayBox& const amrex::Geometry& geom, amrex::Real time, const amrex::Vector &bcrec, int level); +void pelelm_derdmap (PeleLM* a_pelelm, const amrex::Box& bx, amrex::FArrayBox& derfab, int dcomp, int ncomp, + const amrex::FArrayBox& statefab, const amrex::FArrayBox& reactfab, const amrex::FArrayBox& pressfab, + const amrex::Geometry& geom, + amrex::Real time, const amrex::Vector &bcrec, int level); + amrex::Vector pelelm_setuserderives(); void pelelm_deruserdef (PeleLM* a_pelelm, const amrex::Box& bx, amrex::FArrayBox& derfab, int dcomp, int ncomp, diff --git a/Source/PeleLMDeriveFunc.cpp b/Source/PeleLMDeriveFunc.cpp index 0d82c0aa..82c98509 100644 --- a/Source/PeleLMDeriveFunc.cpp +++ b/Source/PeleLMDeriveFunc.cpp @@ -672,3 +672,22 @@ void pelelm_derlambda (PeleLM* a_pelelm, const Box& bx, FArrayBox& derfab, int d } }); } + +// +// Extract Distribution Mapping +// +void pelelm_derdmap (PeleLM* a_pelelm, const Box& bx, FArrayBox& derfab, int dcomp, int /*ncomp*/, + const FArrayBox& /*statefab*/, const FArrayBox& /*reactfab*/, const FArrayBox& /*pressfab*/, + const Geometry& /*geom*/, + Real /*time*/, const Vector& /*bcrec*/, int /*level*/) + +{ + AMREX_ASSERT(derfab.box().contains(bx)); + auto der = derfab.array(dcomp); + const int myrank = ParallelDescriptor::MyProc(); + amrex::ParallelFor(bx, + [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + der(i,j,k) = myrank; + }); +} diff --git a/Source/PeleLMInit.cpp b/Source/PeleLMInit.cpp index 5176937d..a60bd3d1 100644 --- a/Source/PeleLMInit.cpp +++ b/Source/PeleLMInit.cpp @@ -88,6 +88,9 @@ void PeleLM::MakeNewLevelFromScratch( int lev, m_t_new[lev] = time; m_t_old[lev] = time - 1.0e200; + // Load balance + m_costs[lev] = std::make_unique>(ba, dm); + // Mac projector #ifdef AMREX_USE_EB macproj.reset(new Hydro::MacProjector(Geom(0,finest_level), @@ -247,6 +250,9 @@ void PeleLM::initData() { advanceChemistry(lev, dtInit/2.0, Forcing); } } + if (m_doLoadBalance) { + loadBalanceChemLev(lev); + } } // Copy back old -> new copyStateOldToNew(); diff --git a/Source/PeleLMReactions.cpp b/Source/PeleLMReactions.cpp index b5f130e3..141e3b09 100644 --- a/Source/PeleLMReactions.cpp +++ b/Source/PeleLMReactions.cpp @@ -25,8 +25,7 @@ void PeleLM::advanceChemistry(std::unique_ptr &advData) } // This advanceChemistry is called on the finest level -// It works with the AmrCore BoxArray and do not involve ParallelCopy and averaged down -// version of I_R +// It works with the AmrCore BoxArray and do not involve ParallelCopy void PeleLM::advanceChemistry(int lev, const Real &a_dt, MultiFab &a_extForcing) diff --git a/Source/PeleLMRegrid.cpp b/Source/PeleLMRegrid.cpp index 8660e846..7cf8bb35 100644 --- a/Source/PeleLMRegrid.cpp +++ b/Source/PeleLMRegrid.cpp @@ -6,11 +6,253 @@ void PeleLM::regrid(int lbase, amrex::Real time, bool initial) { - if (lbase >= max_level) return; + BL_PROFILE("PeleLM::regrid()"); + + if (!m_doLoadBalance && lbase >= max_level) return; + if (!m_regrid_file.empty()) { regridFromGridFile(lbase, time, initial); + } else { - AmrCore::regrid(lbase, time, initial); + + // Load balance base grid + if (lbase == 0 && m_doLoadBalance && !initial) { + + if ( m_verbose > 0) { + Print() << " Load balancing level 0 \n"; + } + + int remakeLevel = false; + + computeCosts(0); + + // Use efficiency: average MPI rank cost / max cost + amrex::Real currentEfficiency = 0.0; + amrex::Real 
testEfficiency = 0.0; + + DistributionMapping test_dmap; + // Build the test dmap, w/o braodcasting + if (m_loadBalanceMethod == LoadBalanceMethod::SFC) { + + test_dmap = DistributionMapping::makeSFC(*m_costs[0], + currentEfficiency, testEfficiency, + false, + ParallelDescriptor::IOProcessorNumber()); + + } else if (m_loadBalanceMethod == LoadBalanceMethod::Knapsack) { + + const amrex::Real navg = static_cast(grids[0].size()) / + static_cast(ParallelDescriptor::NProcs()); + const int nmax = static_cast(std::max(std::round(m_loadBalanceKSfactor*navg), std::ceil(navg))); + test_dmap = DistributionMapping::makeKnapSack(*m_costs[0], + currentEfficiency, testEfficiency, + nmax, + false, + ParallelDescriptor::IOProcessorNumber()); + } + + // IO proc determine if the test dmap offers significant improvements + if ((m_loadBalanceEffRatioThreshold > 0.0) + && (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())) + { + remakeLevel = remakeLevel || (testEfficiency > m_loadBalanceEffRatioThreshold*currentEfficiency); + } + ParallelDescriptor::Bcast(&remakeLevel, 1, ParallelDescriptor::IOProcessorNumber()); + + if ( m_verbose > 1 && remakeLevel) { + Print() << " Current LoadBalancing efficiency: " << currentEfficiency << "\n" + << " Test LoadBalancing efficiency: " << testEfficiency << " \n"; + } + + // Bcast the test dmap and remake level + if (remakeLevel) { + Vector pmap; + if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) { + pmap = test_dmap.ProcessorMap(); + } else { + pmap.resize(static_cast(grids[0].size())); + } + ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber()); + + if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber()) { + test_dmap = DistributionMapping(pmap); + } + + RemakeLevel(0, time, grids[0], test_dmap); + SetDistributionMap(0, test_dmap); + } + } + + if (lbase >= max_level) return; + + int new_finest; + Vector new_grids(finest_level+2); + MakeNewGrids(lbase, time, new_finest, new_grids); + + BL_ASSERT(new_finest <= finest_level+1); + + bool coarse_ba_changed = false; + for (int lev = lbase+1; lev <= new_finest; ++lev) + { + if (lev <= finest_level) // an old level + { + bool ba_changed = (new_grids[lev] != grids[lev]); + + // If initial grid generation or not doing load balancing, + // remake level only if the grid changed using the default DMap strategy + if ( initial || !m_doLoadBalance ) { + if ( ba_changed || coarse_ba_changed ) { + BoxArray level_grids = grids[lev]; + DistributionMapping level_dmap = dmap[lev]; + if (ba_changed) { + level_grids = new_grids[lev]; + level_dmap = DistributionMapping(level_grids); + } + const auto old_num_setdm = num_setdm; + RemakeLevel(lev, time, level_grids, level_dmap); + SetBoxArray(lev, level_grids); + if (old_num_setdm == num_setdm) { + SetDistributionMap(lev, level_dmap); + } + } + + // When doing load balancing, remake level if grid changed or a better dmap + // can be obtained + } else { + + int remakeLevel = false; + + BoxArray new_ba; + DistributionMapping new_dmap; + + // If the grid changed, let's build a new dmap + if (ba_changed) { + + remakeLevel = true; + + new_ba = new_grids[lev]; + new_dmap = DistributionMapping(new_ba); + + // Get the cost on a LayoutData associated with the new grid + LayoutData new_cost(new_ba,new_dmap); + computeCosts(lev, new_cost, m_loadBalanceCost); + + if (m_loadBalanceMethod == LoadBalanceMethod::SFC) { + Vector costsVec(new_ba.size()); + ParallelDescriptor::GatherLayoutDataToVector(new_cost, 
costsVec, + ParallelContext::IOProcessorNumberSub()); + ParallelDescriptor::Bcast(costsVec.data(), costsVec.size(), ParallelContext::IOProcessorNumberSub()); + Real efficiency; + new_dmap = DistributionMapping::makeSFC(costsVec, new_ba, efficiency); + + } else if (m_loadBalanceMethod == LoadBalanceMethod::Knapsack) { + + const amrex::Real navg = static_cast(new_ba.size()) / + static_cast(ParallelDescriptor::NProcs()); + const int nmax = static_cast(std::max(std::round(m_loadBalanceKSfactor*navg), std::ceil(navg))); + Vector costsVec(new_ba.size()); + ParallelDescriptor::GatherLayoutDataToVector(new_cost, costsVec, + ParallelContext::IOProcessorNumberSub()); + ParallelDescriptor::Bcast(costsVec.data(), costsVec.size(), ParallelContext::IOProcessorNumberSub()); + Real efficiency; + new_dmap = DistributionMapping::makeKnapSack(costsVec, efficiency, nmax); + } + + // Let's see if we can get a better dmap + } else { + + if ( m_verbose > 1) { + Print() << " Load balancing level " << lev << "\n"; + } + + // Try to build a new dmap with the same old grid + new_ba = grids[lev]; + DistributionMapping test_dmap; + + computeCosts(lev); + + // Use efficiency: average MPI rank cost / max cost + amrex::Real currentEfficiency = 0.0; + amrex::Real testEfficiency = 0.0; + + // Build the test dmap, w/o braodcasting + if (m_loadBalanceMethod == LoadBalanceMethod::SFC) { + + test_dmap = DistributionMapping::makeSFC(*m_costs[lev], + currentEfficiency, testEfficiency, + false, + ParallelDescriptor::IOProcessorNumber()); + + } else if (m_loadBalanceMethod == LoadBalanceMethod::Knapsack) { + + const amrex::Real navg = static_cast(new_ba.size()) / + static_cast(ParallelDescriptor::NProcs()); + const int nmax = static_cast(std::max(std::round(m_loadBalanceKSfactor*navg), std::ceil(navg))); + test_dmap = DistributionMapping::makeKnapSack(*m_costs[lev], + currentEfficiency, testEfficiency, + nmax, + false, + ParallelDescriptor::IOProcessorNumber()); + } + + // IO proc determine if the test dmap offers significant improvements + if ((m_loadBalanceEffRatioThreshold > 0.0) + && (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())) + { + remakeLevel = remakeLevel || (testEfficiency > m_loadBalanceEffRatioThreshold*currentEfficiency); + } + ParallelDescriptor::Bcast(&remakeLevel, 1, ParallelDescriptor::IOProcessorNumber()); + + if ( m_verbose > 1 && remakeLevel) { + Print() << " Current LoadBalancing efficiency: " << currentEfficiency << "\n" + << " Test LoadBalancing efficiency: " << testEfficiency << " \n"; + } + + // Bcast the test dmap if we plan on remaking the level + if (remakeLevel) { + Vector pmap; + if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) { + pmap = test_dmap.ProcessorMap(); + } else { + pmap.resize(static_cast(new_ba.size())); + } + ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber()); + + if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber()) { + test_dmap = DistributionMapping(pmap); + } + new_dmap = test_dmap; + } + } + + if (remakeLevel) { + const auto old_num_setdm = num_setdm; + RemakeLevel(lev, time, new_ba, new_dmap); + SetBoxArray(lev, new_ba); + if (old_num_setdm == num_setdm) { + SetDistributionMap(lev, new_dmap); + } + } + } + coarse_ba_changed = ba_changed; + } else { // a new level, use default DMap strategy + DistributionMapping new_dmap(new_grids[lev]); + const auto old_num_setdm = num_setdm; + MakeNewLevelFromCoarse(lev, time, new_grids[lev], new_dmap); + SetBoxArray(lev, 
new_grids[lev]); + if (old_num_setdm == num_setdm) { + SetDistributionMap(lev, new_dmap); + } + } + } + + for (int lev = new_finest+1; lev <= finest_level; ++lev) { + ClearLevel(lev); + ClearBoxArray(lev); + ClearDistributionMap(lev); + } + + finest_level = new_finest; } } @@ -25,7 +267,7 @@ void PeleLM::MakeNewLevelFromCoarse( int lev, if (m_verbose > 2) { auto const dx = geom[lev].CellSizeArray(); Real vol = AMREX_D_TERM(dx[0],*dx[1],*dx[2]); - amrex::Print() << " with " << ba.numPts() << " cells," + amrex::Print() << " with " << ba.numPts() << " cells, " << ba.size() << " boxes," << " over " << ba.numPts() * vol / geom[0].ProbSize() * 100 << "% of the domain \n"; } if (m_verbose > 3) { @@ -101,6 +343,9 @@ void PeleLM::MakeNewLevelFromCoarse( int lev, m_precond_op.reset(); #endif + // Load balance + m_costs[lev] = std::make_unique>(ba, dm); + // DiffusionOp will be recreated m_diffusion_op.reset(); m_mcdiffusion_op.reset(); @@ -124,7 +369,7 @@ void PeleLM::RemakeLevel( int lev, if (m_verbose > 2) { auto const dx = geom[lev].CellSizeArray(); Real vol = AMREX_D_TERM(dx[0],*dx[1],*dx[2]); - amrex::Print() << " with " << ba.numPts() << " cells," + amrex::Print() << " with " << ba.numPts() << " cells," << ba.size() << " boxes," << " over " << ba.numPts() * vol / geom[0].ProbSize() * 100 << "% of the domain \n"; } if (m_verbose > 3) { @@ -200,6 +445,9 @@ void PeleLM::RemakeLevel( int lev, m_precond_op.reset(); #endif + // Load balance + m_costs[lev] = std::make_unique>(ba, dm); + // DiffusionOp will be recreated m_diffusion_op.reset(); m_mcdiffusion_op.reset(); @@ -233,6 +481,67 @@ void PeleLM::ClearLevel(int lev) { } #endif m_extSource[lev]->clear(); + + m_costs[lev].reset(); + m_loadBalanceEff[lev] = -1.0; +} + +void PeleLM::computeCosts(int a_lev, LayoutData &a_costs, int a_costMethod) +{ + if (a_costMethod == LoadBalanceCost::Ncell ) { + for (MFIter mfi(a_costs, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = mfi.validbox().numPts(); + } + } else if (a_costMethod == LoadBalanceCost::ChemFunctCallAvg ) { + MultiFab costMF(a_costs.boxArray(), a_costs.DistributionMap(), 1, 0); + fillpatch_chemFunctCall(a_lev, m_cur_time, costMF, 0); + for (MFIter mfi(costMF, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = costMF[mfi].sum(mfi.validbox(),0) / mfi.validbox().numPts(); + } + } else if (a_costMethod == LoadBalanceCost::ChemFunctCallMax ) { + MultiFab costMF(a_costs.boxArray(), a_costs.DistributionMap(), 1, 0); + fillpatch_chemFunctCall(a_lev, m_cur_time, costMF, 0); + for (MFIter mfi(costMF, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = costMF[mfi].max(mfi.validbox(),0); + } + } else if (a_costMethod == LoadBalanceCost::ChemFunctCallSum ) { + MultiFab costMF(a_costs.boxArray(), a_costs.DistributionMap(), 1, 0); + fillpatch_chemFunctCall(a_lev, m_cur_time, costMF, 0); + for (MFIter mfi(costMF, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = costMF[mfi].sum(mfi.validbox(),0); + } + } else if (a_costMethod == LoadBalanceCost::UserDefinedDerivedAvg) { + MultiFab costMF(a_costs.boxArray(), a_costs.DistributionMap(), 1, 0); + costMF.setVal(0.0); + std::unique_ptr mf; + mf = derive("derUserDefined", m_cur_time, a_lev, 0); + costMF.ParallelCopy(*mf,0,0,1); + for (MFIter mfi(costMF, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = costMF[mfi].sum(mfi.validbox(),0) / mfi.validbox().numPts(); + } + } else if (a_costMethod == LoadBalanceCost::UserDefinedDerivedSum) { + MultiFab costMF(a_costs.boxArray(), a_costs.DistributionMap(), 1, 0); + costMF.setVal(0.0); + std::unique_ptr mf; + mf = 
derive("derUserDefined", m_cur_time, a_lev, 0); + costMF.ParallelCopy(*mf,0,0,1); + for (MFIter mfi(costMF, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = costMF[mfi].sum(mfi.validbox(),0); + } + } else { + Abort(" Unknown cost estimate method !"); + } +} + +void PeleLM::computeCosts(int a_lev) +{ + computeCosts(a_lev, *m_costs[a_lev], m_loadBalanceCost); } void PeleLM::resetMacProjector() diff --git a/Source/PeleLMSetup.cpp b/Source/PeleLMSetup.cpp index 517f3275..e36b98e4 100644 --- a/Source/PeleLMSetup.cpp +++ b/Source/PeleLMSetup.cpp @@ -408,6 +408,25 @@ void PeleLM::readParameters() { } } + // ----------------------------------------- + // Load Balancing + // ----------------------------------------- + pp.query("do_load_balancing",m_doLoadBalance); + parseUserKey(pp, "load_balancing_method", lbmethod, m_loadBalanceMethod); + parseUserKey(pp, "load_balancing_cost_estimate", lbcost, m_loadBalanceCost); + pp.query("load_balancing_efficiency_threshold",m_loadBalanceEffRatioThreshold); + parseUserKey(pp, "chem_load_balancing_method", lbmethod, m_loadBalanceMethodChem); + parseUserKey(pp, "chem_load_balancing_cost_estimate", lbcost, m_loadBalanceCostChem); + + // Deactivate load balancing for serial runs +#ifdef AMREX_USE_MPI + if (ParallelContext::NProcsSub() == 1) { + m_doLoadBalance = 0; + } +#else + m_doLoadBalance = 0; +#endif + // ----------------------------------------- // Advection // ----------------------------------------- @@ -485,7 +504,7 @@ void PeleLM::readParameters() { ppa.query("max_dt", m_max_dt); ppa.query("min_dt", m_min_dt); - if ( max_level > 0 ) { + if ( max_level > 0 || m_doLoadBalance) { ppa.query("regrid_int", m_regrid_int); ppa.query("regrid_on_restart", m_regrid_on_restart); } @@ -858,6 +877,9 @@ void PeleLM::derivedSetup() } + // Distribution Map + derive_lst.add("DistributionMap",IndexType::TheCellType(),1,pelelm_derdmap,the_same_box); + // Cell average pressure derive_lst.add("avg_pressure",IndexType::TheCellType(),1,pelelm_deravgpress,the_same_box); @@ -1125,8 +1147,13 @@ void PeleLM::resizeArray() { // Time m_t_old.resize(max_level+1); m_t_new.resize(max_level+1); + #ifdef PELELM_USE_SPRAY m_spraystate.resize(max_level+1); m_spraysource.resize(max_level+1); #endif + + // Load balancing + m_costs.resize(max_level+1); + m_loadBalanceEff.resize(max_level+1); } diff --git a/Source/PeleLMUserKeys.H b/Source/PeleLMUserKeys.H new file mode 100644 index 00000000..53df4ef3 --- /dev/null +++ b/Source/PeleLMUserKeys.H @@ -0,0 +1,168 @@ +#ifndef PELE_USERKEYS_H +#define PELE_USERKEYS_H +#include + +#include +#include + +/** + * \brief struct holding PeleLMeX physical BC options + */ +struct BoundaryCondition { + enum { + BCInterior = 0, + BCInflow, + BCOutflow, + BCSymmetry, + BCSlipWallAdiab, + BCNoSlipWallAdiab, + BCSlipWallIsotherm, + BCNoSlipWallIsotherm + }; + const std::map str2int = { + {"interior", BCInterior}, + {"inflow", BCInflow}, + {"outflow", BCOutflow}, + {"symmetry", BCSymmetry}, + {"slipwalladiab", BCSlipWallAdiab}, + {"noslipwalladiab", BCNoSlipWallAdiab}, + {"bcslipwallisotherm", BCSlipWallIsotherm}, + {"bcnoslipwallisotherm", BCNoSlipWallIsotherm} + }; + const amrex::Array searchKey{"lo_bc","hi_bc"}; +}; +const BoundaryCondition boundarycondition; + +/** + * \brief struct holding PeleLMeX NS solver options + default is LowMachNumber + */ +struct NSSolver { + enum { + LowMachNumber = 0, + Incompressible + }; + const std::map str2int = { + {"lowmachnumber", LowMachNumber}, + {"incompressible", Incompressible}, + {"default", LowMachNumber} + }; + 
const amrex::Array searchKey{"ns_solver"}; +}; +const NSSolver nssolver; + +/** + * \brief struct holding PeleLMeX interpolator options + default is PiecewiseLinearConserv + */ +struct Interpolator { + enum { + PiecewiseLinearConserv = 0, + PiecewiseLinearConservMinMax, + PiecewiseConstant + }; + const std::map str2int = { + {"pwlinearconserv", PiecewiseLinearConserv}, + {"pwlinearconservminmax", PiecewiseLinearConservMinMax}, + {"pwconstant", PiecewiseConstant}, + {"default", PiecewiseLinearConserv} + }; + const amrex::Array searchKey{"regrid_interp_method"}; +}; +const Interpolator interpolator; + +/** + * \brief struct holding PeleLMeX LES SGS model options + default is None + */ +struct LESModel { + enum { + None = 0, + Smagorinsky, + WALE, + Sigma + }; + const std::map str2int = { + {"none", None}, + {"smagorinsky", Smagorinsky}, + {"wale", WALE}, + {"sigma", Sigma}, + {"default", None} + }; + const amrex::Array searchKey{"les_model"}; +}; +const LESModel lesmodel; + +/** + * \brief struct holding PeleLMeX advection scheme options + default is Godunov_PLM + */ +struct AdvectionScheme { + enum { + Godunov_PLM = 0, + Godunov_PPM, + Godunov_PPM_WENOZ, + Godunov_PPM_NOLIM, + Godunov_BDS + }; + const std::map str2int = { + {"godunov_plm", Godunov_PLM}, + {"godunov_ppm", Godunov_PPM}, + {"godunov_ppm_wenoz", Godunov_PPM_WENOZ}, + {"godunov_ppm_nolim", Godunov_PPM_NOLIM}, + {"godunov_bds", Godunov_BDS}, + {"default", Godunov_PLM} + }; + const amrex::Array searchKey{"advection_scheme"}; +}; +const AdvectionScheme advscheme; + +/** + * \brief struct holding PeleLMeX load balancing cost options + default is Ncell + */ +struct LoadBalanceCost { + enum { + Ncell = 0, + ChemFunctCallAvg, + ChemFunctCallMax, + ChemFunctCallSum, + UserDefinedDerivedAvg, + UserDefinedDerivedSum, + Timers + }; + const std::map str2int = { + {"ncell", Ncell}, + {"chemfunctcall_avg", ChemFunctCallAvg}, + {"chemfunctcall_max", ChemFunctCallMax}, + {"chemfunctcall_sum", ChemFunctCallSum}, + {"userdefined_avg", UserDefinedDerivedAvg}, + {"userdefined_sum", UserDefinedDerivedSum}, + {"default", Ncell} + }; + const amrex::Array searchKey{"load_balancing_cost_estimate", + "chem_load_balancing_cost_estimate"}; +}; +const LoadBalanceCost lbcost; + +/** + * \brief struct holding PeleLMeX load balancing method options + default is SFC + */ +struct LoadBalanceMethod { + enum { + SFC = 0, + Knapsack, + RoundRobin + }; + const std::map str2int = { + {"sfc", SFC}, + {"knapsack", Knapsack}, + {"roundrobin", RoundRobin}, + {"default", SFC} + }; + const amrex::Array searchKey{"load_balancing_method", + "chem_load_balancing_method"}; +}; +const LoadBalanceMethod lbmethod; +#endif diff --git a/Source/PeleLMUtils.cpp b/Source/PeleLMUtils.cpp index c7b88c68..c0bf3545 100644 --- a/Source/PeleLMUtils.cpp +++ b/Source/PeleLMUtils.cpp @@ -602,6 +602,7 @@ void PeleLM::resetCoveredMask() if (m_verbose) Print() << " Resetting fine-covered cells mask \n"; for (int lev = 0; lev < finest_level; ++lev) { + // Set a fine-covered mask BoxArray baf = grids[lev+1]; baf.coarsen(ref_ratio[lev]); m_coveredMask[lev]->setVal(1); @@ -653,21 +654,35 @@ void PeleLM::resetCoveredMask() } m_baChem[lev].reset(new BoxArray(std::move(bl))); m_dmapChem[lev].reset(new DistributionMapping(*m_baChem[lev])); + + // Load balancing of the chemistry DMap + if (m_doLoadBalance) { + loadBalanceChemLev(lev); + } } // Set a BoxArray for the chemistry on the finest level too m_baChem[finest_level].reset(new BoxArray(grids[finest_level])); if ( m_max_grid_size_chem.min() > 0 ) { - 
m_baChem[finest_level]->maxSize(m_max_grid_size_chem); + m_baChem[finest_level]->maxSize(m_max_grid_size_chem); } m_baChemFlag[finest_level].resize(m_baChem[finest_level]->size()); std::fill(m_baChemFlag[finest_level].begin(), m_baChemFlag[finest_level].end(), 1); m_dmapChem[finest_level].reset(new DistributionMapping(*m_baChem[finest_level])); + if (m_doLoadBalance && m_max_grid_size_chem.min() > 0) { + loadBalanceChemLev(finest_level); + } + // Switch off trigger m_resetCoveredMask = 0; - } + } else { + // Just load balance the chem. distribution map + if (m_doLoadBalance) { + loadBalanceChem(); + } + } //---------------------------------------------------------------------------- // Need to compute the uncovered volume @@ -680,9 +695,84 @@ void PeleLM::resetCoveredMask() } m_uncoveredVol = MFSum(GetVecOfConstPtrs(dummy),0); } +} + +void +PeleLM::loadBalanceChem() { + + for (int lev = 0; lev <= finest_level; ++lev) { + // Finest grid uses AmrCore DM unless different max grid size specified. + // Keep the AmrCore DM. + if ( lev == finest_level && m_max_grid_size_chem.min() < 0 ) { + continue; + } + loadBalanceChemLev(lev); + } +} + +void +PeleLM::loadBalanceChemLev(int a_lev) { + + LayoutData new_cost(*m_baChem[a_lev],*m_dmapChem[a_lev]); + computeCosts(a_lev, new_cost, m_loadBalanceCostChem); + + // Use efficiency: average MPI rank cost / max cost + amrex::Real currentEfficiency = 0.0; + amrex::Real testEfficiency = 0.0; + + DistributionMapping test_dmap; + // Build the test dmap, w/o braodcasting + if (m_loadBalanceMethodChem == LoadBalanceMethod::SFC) { + + test_dmap = DistributionMapping::makeSFC(new_cost, + currentEfficiency, testEfficiency, + false, + ParallelDescriptor::IOProcessorNumber()); + + } else if (m_loadBalanceMethodChem == LoadBalanceMethod::Knapsack) { + + const amrex::Real navg = static_cast(m_baChem[a_lev]->size()) / + static_cast(ParallelDescriptor::NProcs()); + const int nmax = static_cast(std::max(std::round(m_loadBalanceKSfactor*navg), std::ceil(navg))); + test_dmap = DistributionMapping::makeKnapSack(new_cost, + currentEfficiency, testEfficiency, + nmax, + false, + ParallelDescriptor::IOProcessorNumber()); + } + // IO proc determine if the test dmap offers significant improvements + int updateDmap = false; + if ((m_loadBalanceEffRatioThreshold > 0.0) + && (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())) + { + updateDmap = testEfficiency > m_loadBalanceEffRatioThreshold*currentEfficiency; + } + ParallelDescriptor::Bcast(&updateDmap, 1, ParallelDescriptor::IOProcessorNumber()); + + if ( m_verbose > 2 && updateDmap) { + Print() << " Old Chem LoadBalancing efficiency on a_lev " << a_lev << ": " << currentEfficiency << "\n" + << " New Chem LoadBalancing efficiency: " << testEfficiency << " \n"; + } + + // Bcast the test dmap if better + if (updateDmap) { + Vector pmap; + if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) { + pmap = test_dmap.ProcessorMap(); + } else { + pmap.resize(static_cast(m_baChem[a_lev]->size())); + } + ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber()); + + if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber()) { + test_dmap = DistributionMapping(pmap); + } + m_dmapChem[a_lev].reset(new DistributionMapping(test_dmap)); + } } + // Return a unique_ptr with the entire derive std::unique_ptr PeleLM::derive(const std::string &a_name, diff --git a/Submodules/AMReX-Hydro b/Submodules/AMReX-Hydro index 38ad3080..7aa8b657 160000 --- 
a/Submodules/AMReX-Hydro +++ b/Submodules/AMReX-Hydro @@ -1 +1 @@ -Subproject commit 38ad3080464e4bfaf8c3c5a17aa7df0e80d85f8d +Subproject commit 7aa8b65782d0309fb5b84926f906f306edfa4509 diff --git a/Submodules/PelePhysics b/Submodules/PelePhysics index 2108b3ef..29a49c38 160000 --- a/Submodules/PelePhysics +++ b/Submodules/PelePhysics @@ -1 +1 @@ -Subproject commit 2108b3efda4d5fbfc2e6e46e0982d70a968a0191 +Subproject commit 29a49c38a68eb39146ca8489c599e0a0d2ea49ad diff --git a/Submodules/amrex b/Submodules/amrex index bacaa10a..b0a03791 160000 --- a/Submodules/amrex +++ b/Submodules/amrex @@ -1 +1 @@ -Subproject commit bacaa10a76366ed40b90b422cf5c7da43565281a +Subproject commit b0a037918f78a5e9f32988c4b0016093929954e7