From e5864ff868a6357fd34bceafee60e0979a6c6bbc Mon Sep 17 00:00:00 2001
From: Lucas Esclapez <13371051+esclapez@users.noreply.github.com>
Date: Fri, 28 Apr 2023 11:44:36 -0700
Subject: [PATCH] Add load balancing control (#198)

* Redo load balancing (only).
* Update submodules.
* Describe additional derived vars.
* Add a brief description of the code base.
* A few fixes.
* Add control doc.
* Trailing whitespaces.
* Use LMeX version of the diagnostics only if requested.
* Add a couple of details to address Bruce's comments.
---
 CMakeLists.txt | 1 +
 Docs/source/manual/Implementation.rst | 197 ++++++++++++++++
 Docs/source/manual/LMeXControls.rst | 31 ++-
 Docs/source/manual/Model.rst | 9 +-
 Docs/source/manual/index.rst | 1 +
 Source/CMakeLists.txt | 4 +-
 Source/PeleLM.H | 135 ++++++++---
 Source/PeleLMBC.cpp | 52 +++++
 Source/PeleLMDeriveFunc.H | 5 +
 Source/PeleLMDeriveFunc.cpp | 19 ++
 Source/PeleLMInit.cpp | 6 +
 Source/PeleLMReactions.cpp | 3 +-
 Source/PeleLMRegrid.cpp | 317 +++++++++++++++++++++++++-
 Source/PeleLMSetup.cpp | 29 ++-
 Source/PeleLMUserKeys.H | 168 ++++++++++++++
 Source/PeleLMUtils.cpp | 94 +++++++-
 Submodules/AMReX-Hydro | 2 +-
 Submodules/PelePhysics | 2 +-
 Submodules/amrex | 2 +-
 19 files changed, 1027 insertions(+), 50 deletions(-)
 create mode 100644 Docs/source/manual/Implementation.rst
 create mode 100644 Source/PeleLMUserKeys.H

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 57ddcd5c..a67215ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,7 @@ option(PELELMEX_USE_EB "Enable Embedded Boundary" OFF)
 option(PELELMEX_USE_EFIELD "Enable Electric field module" OFF)
 option(PELELMEX_USE_PARTICLES "Enable Spray module" OFF)
 option(PELELMEX_USE_SOOT "Enable Soot module" OFF)
+option(PELELMEX_USE_DIAG "Enable LMeX Diagn module" OFF)
 
 #
 # Misc options
diff --git a/Docs/source/manual/Implementation.rst b/Docs/source/manual/Implementation.rst
new file mode 100644
index 00000000..760c1ea5
--- /dev/null
+++ b/Docs/source/manual/Implementation.rst
@@ -0,0 +1,197 @@
+.. highlight:: rst
+
+.. _sec:code:
+
+Source code
+===========
+
+The following provides an overview of the *PeleLMeX* source code and basic information on the data structures, and is
+useful for any user intending to use and/or do some development in the code.
+
+Overview of source code
+-----------------------
+
+*PeleLMeX* is based upon AMReX's `AmrCore `_ from which it inherits
+an AMR hierarchy data structure and basic regridding functionalities. The code is entirely written in C++, with low-level
+compute-intensive kernels implemented as lambda functions to seamlessly run on CPUs and various GPU backends through the AMReX
+high-performance portability abstraction.
+
+The core of the algorithm is implemented in the ``advance()`` function, which acts on all the levels concurrently.
+Projection operators and advection scheme functions are imported from the `AMReX-Hydro library `_,
+while the core of the thermo-chemistry functionalities comes from `PelePhysics `_.
+Users are responsible for providing initial and boundary conditions in the local subfolder implementing their case, i.e. it is
+not possible to compile and run *PeleLMeX* without actually writing a few lines of code. However, numerous examples are provided
+in ``Exec/RegTests`` from which new users can draw to set up their new case.
+
+The source code contains a few dozen files, organized around the pieces of the algorithm and major functionalities:
+
+* ``PeleLMEvolve``: top level time advance loop, with IO/exit controls
+* ``PeleLMSetup``: setting up the simulation parameters, parsing the input file
+* ``PeleLMInit``: generating the initial solution from scratch or a checkpoint file, performing initial projections/iteration(s)
+* ``PeleLMAdvance``: top level implementation of the time step algorithm
+* ``PeleLMProjection``: implement the various flavors of the nodal projection
+* ``PeleLMUmac``: implement the construction and projection of MAC-velocities
+* ``PeleLMAdvection``: functions to compute the explicit advection terms
+* ``PeleLMDiffusion``: functions to compute the diffusion terms, using the operators defined in ``DiffusionOp``
+* ``PeleLMReaction``: function using *PelePhysics* reactors to integrate the chemistry and linearized advection/diffusion
+* ``PeleLMPlot``: implementation of plotfile and checkpoint file IOs
+* ``PeleLMBC``: functions filling the ghost cells (at fine/fine, coarse/fine and domain boundaries)
+* ``PeleLMRegrid``: creating new AMR levels or remaking modified AMR levels during adaptive refinement
+* ``PeleLMTagging``: mark cells for refinement
+* ``PeleLM_K.H``: low-level kernel functions
+
+Data structure and containers
+-----------------------------
+
+AMReX 101
+^^^^^^^^^
+
+The basic AMReX data structure is the `MultiFab `_
+(historically, a multi Fortran Array Box (FAB)).
+Within the block-structured AMR approach of AMReX, the domain is decomposed into non-overlapping rectangular `boxes`,
+which can be assembled into a `boxArray`. Each AMR level has a `boxArray` providing the list of `boxes` of that level.
+The `boxes` are distributed across the MPI ranks, the mapping of which is described by a `DistributionMap`. Given a
+`boxArray` and a `DistributionMap`, one can define an actual data container (`boxes` are only lightweight descriptors
+of the geometrical rectangular objects, containing bounds and centering information), where each rank will
+allocate a FAB for the boxes it owns in the `boxArray`, resulting in a collection of FABs, or a MultiFab, distributed
+across the MPI ranks.
+
+To access the data in a MultiFab, one uses an `MFIter `_
+(or MultiFab iterator), which provides each MPI rank access to the FABs it owns within the MultiFab. Actual access to the data in
+memory is then provided by the lightweight `Array4` structure, and it is strongly advised to rely on the AMReX
+`ParallelFor `_ function template to loop through the logical `i,j,k` indexes.
+For example, to set the velocity data stored in a MultiFab called `NewState` with data from an `OldState` and an increment
+from a third MultiFab `advTerm`: ::
+
+    for (MFIter mfi(NewState,TilingIfNotGPU()); mfi.isValid(); ++mfi) {
+        Box const& bx = mfi.tilebox();
+        auto const& velo_old = OldState.const_array(mfi,VELOCITY_INDEX);
+        auto const& incr = advTerm.const_array(mfi);
+        auto const& velo_new = NewState.array(mfi,VELOCITY_INDEX);
+        ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+        {
+            velo_new(i,j,k) = velo_old(i,j,k) + incr(i,j,k);
+        });
+    }
+
+.. note::
+    For the example above to work, all three MultiFabs must share the same `boxArray` and `DistributionMap`.
+
+Users are strongly encouraged to review the content of the `AMReX documentation `_
+to become more familiar with AMReX data structures and environment.
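+
+As a complementary sketch of the relation between these objects (a generic AMReX example, not a *PeleLMeX*
+function; the box sizes are purely illustrative), one can build a `boxArray` covering an index space, chop it
+into smaller `boxes`, distribute them over the MPI ranks and allocate a MultiFab on the resulting layout: ::
+
+    // Minimal sketch, assuming the usual AMReX headers and the amrex namespace
+    Box domain(IntVect(AMREX_D_DECL(0,0,0)), IntVect(AMREX_D_DECL(127,127,127)));
+    BoxArray ba(domain);            // a single box covering the whole index space
+    ba.maxSize(32);                 // chop it into boxes of at most 32 cells per side
+    DistributionMapping dm(ba);     // map each box to an MPI rank
+    MultiFab mf(ba, dm, 1, 0);      // one component, no ghost cells
+    mf.setVal(0.0);                 // each rank only touches the FABs it owns
+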
+*PeleLMeX* state and advance containers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The state vector of *PeleLMeX* contains the 2 or 3 components of velocity, the mixture density, species density (rhoYs),
+rhoH, temperature and the thermodynamic pressure. The state components are stored in a cell-centered MultiFab with
+`NVAR` components. Additionally, the perturbational pressure stored at the nodes is contained in a separate MultiFab.
+Together with the cell-centered pressure gradient, the cell-centered divergence constraint and cell-centered
+transport properties, these MultiFabs are assembled into a `LevelData` struct.
+
+Each level in the AMR hierarchy has two versions of the `LevelData` at any point during the simulation: one
+for the old state and one for the new state. The developer can get a pointer to the `LevelData` struct by
+calling: ::
+
+    auto ldata_p = getLevelDataPtr(lev,AmrOldTime);
+
+with either `AmrOldTime` or `AmrNewTime` on level `lev`. Additionally, calling this function with
+`AmrHalfTime` will return a `LevelData` struct whose `state` is linearly interpolated between the old and new
+states (but the other MultiFabs in `LevelData` are empty!).
+It is also often useful to have access to a vector of a state component across the entire AMR hierarchy. To do so, *PeleLMeX*
+provides a set of functions returning a vector of MultiFab `std::unique_ptr` aliased into the `LevelData`
+MultiFab on each level: ::
+
+    getStateVect(time);         // Return the entire state (ncomp: NVAR)
+    getVelocityVect(time);      // Return the velocity only (ncomp: AMREX_SPACEDIM)
+    getDensityVect(time);       // Return the mixture density (ncomp: 1)
+    getSpeciesVect(time);       // Return the species density (ncomp: NUM_SPECIES)
+    getRhoHVect(time);          // Return rhoH (ncomp: 1)
+    getTempVect(time);          // Return temperature (ncomp: 1)
+    getDivUVect(time);          // Return divergence constraint (ncomp: 1)
+    getDiffusivityVect(time);   // Return diffusivity (ncomp: NUM_SPECIES+2)
+    getViscosityVect(time);     // Return viscosity (ncomp: 1)
+
+where ``time`` can either be `AmrOldTime` or `AmrNewTime`.
+Also available at any point during the simulation is the `LevelDataReact` which contains the species
+chemical source terms. A single version of the container is available on each level and can be accessed
+using: ::
+
+    auto ldataR_p = getLevelDataReactPtr(lev);
+
+Within the time-advance function, the *PeleLMeX* algorithm calls for the computation of the advection,
+diffusion and reaction source terms iteratively using SDC. At each step, the results of other steps
+can be used as part of the numerical scheme (e.g. the explicit advection with a Godunov scheme uses
+the diffusion term). These temporary variables, only useful in the scope of the advance function, are
+assembled into two structs: ``AdvanceDiffData`` and ``AdvanceAdvData``. The former contains three
+MultiFabs for the separate diffusion term evaluations described in :numref:`LMeX_Algo`: :math:`D^n`,
+:math:`D^{n+1,k}` and :math:`D^{n+1,k+1}`, as well as additional containers for the :math:`\overline{W}`
+and Soret contributions. The latter encapsulates the face-centered MAC velocities :math:`U_{ADV}`, the
+advection term :math:`A^{n+1/2,(k+1)}`, the pressure correction :math:`\chi` and a forcing container
+used in the RHS of the advection/diffusion/reaction solves. In contrast with the `LevelData`, these two containers
+are freed at the end of the advance function, and are passed around in the functions called within `advance()`.
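+
+As an illustration of these accessors (a minimal sketch, assuming it is placed inside a *PeleLM* member
+function so that `finest_level`, the `TimeStamp` values and the getters above are all in scope), one can
+report the maximum temperature of the new-time state on each level of the AMR hierarchy: ::
+
+    // Sketch: loop over the AMR hierarchy using the aliased temperature MultiFabs
+    auto tempVec = getTempVect(AmrNewTime);   // one MultiFab per level, aliased into LevelData
+    for (int lev = 0; lev <= finest_level; ++lev) {
+        amrex::Print() << "Level " << lev
+                       << " max T: " << tempVec[lev]->max(0) << "\n";
+    }
+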
+Parallelism
+-----------
+
+*PeleLMeX* inherits the MPI+X approach from the AMReX library, where X can be any of CUDA, HIP or SYCL
+for heterogeneous architectures (OpenMP is currently only available for non-reacting/explicit chemistry integrators).
+The reader is referred to the `AMReX GPU documentation `_ for more details on
+the thread parallelism.
+
+As mentioned above, the top-level spatial decomposition arises from AMReX's block-structured approach. On each level, non-overlapping
+`boxes` are assembled into a `boxArray` and distributed across the MPI ranks with a `DistributionMap` (or `DMap`).
+It is in our best interest to ensure that all the MultiFabs in the code use the same `boxArray` and `DMap`,
+such that operations using `MFIter` can be performed and data copies across MPI ranks are minimized.
+However, it is also important to maintain a good load balance, i.e. ensure that each MPI rank has the same amount
+of work, to avoid wasting computational resources. Reactive flow simulations are challenging because the chemistry
+integration is very spatially heterogeneous, with stiff ODE integration required within the flame front and non-stiff
+integration of the linearized advection/diffusion required in the cold gases or burnt mixture. Additionally, because
+a non-subcycling approach is used in *PeleLMeX*, the chemistry doesn't have to be integrated in fine-covered regions.
+Two `boxArray` and associated `DMap` are thus available in *PeleLMeX*:
+
+1. The first one is inherited from `AmrCore` and is available as ``grids[lev]`` (`boxArray`) and ``dmap[lev]`` (`DMap`) throughout the code. Most
+   of the *PeleLMeX* MultiFabs use these two, and the `boxes` sizes are dictated by the `amr.max_grid_size` and `amr.blocking_factor` keys in the input
+   file. These are employed for all the operations in the code except the chemistry integration. The default load balancing approach is to use
+   a space-filling curve (SFC) with each box weighted by its number of cells. Advanced users can try alternate approaches using the
+   keys listed in :doc:`LMeXControls`.
+2. A second one is created, masking fine-covered regions, and updated during regrid operations. It is used to perform the chemistry integration,
+   and because this is a purely local integration (in contrast with the implicit diffusion solve for instance, which requires communications
+   to solve the linear problem using GMG), a Knapsack load balancing approach is used by default, where the weight of each box is based
+   on the total number of chemistry RHS calls in the box. The size of the `boxes` in the chemistry `boxArray` (accessible with ``m_baChem[lev]``)
+   is controlled by the `peleLM.max_grid_size_chem` key in the input file. Once again, advanced users can try alternate approaches to load
+   balancing the chemistry `DMap` using the keys described in :doc:`LMeXControls`.
+
+After each regrid operation, even if the grids did not actually change, *PeleLMeX* will try to find a better load balance for the
+`AmrCore` `DMap`. Because changing the load balance requires copying data across MPI ranks, we only want to change the `DMap`
+if a significantly better one can be obtained, with the threshold for a better `DMap` defined based on the value of
+`peleLM.load_balancing_efficiency_threshold`.
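+
+For reference, a minimal input-file excerpt activating the load balancing described above could read as follows
+(the values shown are only illustrative; the complete list of keys and valid values is given in :doc:`LMeXControls`): ::
+
+    peleLM.do_load_balancing = 1                                  # activate load balancing
+    peleLM.load_balancing_method = sfc                            # AmrCore dmap: space-filling curve
+    peleLM.load_balancing_cost_estimate = ncell                   # AmrCore dmap: cost = number of cells per box
+    peleLM.chem_load_balancing_method = knapsack                  # chemistry dmap: knapsack
+    peleLM.chem_load_balancing_cost_estimate = chemfunctcall_sum  # chemistry dmap: cost = chemistry RHS calls per box
+    peleLM.load_balancing_efficiency_threshold = 1.05             # only remap if the new dmap is 5% more efficient
+    peleLM.max_grid_size_chem = 32                                # max box size of the chemistry boxArray
+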
+Debugging
+---------
+
+The first step to debug any addition or undefined behavior of *PeleLMeX* is to turn the ``DEBUG`` flag ``ON`` in the
+GNUmakefile and activate AMReX's floating point exception traps in the input file: ::
+
+    amrex.fpe_trap_invalid = 1
+    amrex.fpe_trap_zero = 1
+    amrex.fpe_trap_overflow = 1
+
+This will slow down the code considerably, but will enable bound checks on all AMReX low-level data structures,
+catch floating point errors (use of NaNs, division by zero, ...) and trigger any ``AMREX_ASSERT`` statement added to the
+code base. It is also often useful to visualize data in order to understand the erroneous results the solver can
+return. Developers can write a single MultiFab to disk using AMReX `VisMF`: ::
+
+    VisMF::Write(myMF,"VisMyMF");
+
+which can be visualized with `Amrvis` using `amrvis -mf`. Alternatively, visualizing the entire AMR hierarchy is also
+useful. *PeleLMeX* provides a simple function to write a vector of MultiFabs: ::
+
+    WriteDebugPlotFile(GetVecOfConstPtrs(getTempVect()),"TempDebug");
+
+which can be opened with `Amrvis` or any other visualization software. This last function works provided that
+the MultiFabs in the vector all have the same number of components.
+Finally, another way of checking individual pieces of the algorithm is to use the *PeleLMeX* evaluate mode ``peleLM.run_mode=evaluate``
+and specify a list of fields with ``peleLM.evaluate_vars`` as described in :doc:`LMeXControls`. Note that not all of the
+algorithm is available in this mode yet.
diff --git a/Docs/source/manual/LMeXControls.rst b/Docs/source/manual/LMeXControls.rst
index 8eca64cf..791b590a 100644
--- a/Docs/source/manual/LMeXControls.rst
+++ b/Docs/source/manual/LMeXControls.rst
@@ -52,6 +52,27 @@ Grid/AMR parameters
 
     amr.blocking_factor = 16       # block factor in grid generation (min box size)
     amr.max_grid_size = 64         # max box size
+    peleLM.max_grid_size_chem = 32 # [OPT, DEF="None"] Max box size for the Chemistry BoxArray
+
+Load balancing
+--------------
+
+*PeleLMeX* relies on two distribution maps (see :doc:`Implementation` for more details).
+
+::
+
+    peleLM.do_load_balancing = 1                       # [OPT, DEF=0] Activate load balancing
+    peleLM.load_balancing_method = sfc                 # [OPT, DEF="sfc"] AmrCore dmap load balancing method
+    peleLM.load_balancing_cost_estimate = ncell        # [OPT, DEF="ncell"] AmrCore dmap balancing cost
+    peleLM.chem_load_balancing_method = knapsack       # [OPT, DEF="knapsack"] Chemistry dmap load balancing method
+    peleLM.chem_load_balancing_cost_estimate = chemfunctcall_sum # [OPT, DEF="chemfunctcall_sum"] Chemistry dmap balancing cost
+    peleLM.load_balancing_efficiency_threshold = 1.05  # What constitutes a better dmap?
+
+The balancing method can be one of `sfc`, `roundrobin` or `knapsack`, while the cost estimate can be one of
+`ncell`, `chemfunctcall_avg`, `chemfunctcall_max`, `chemfunctcall_sum`, `userdefined_avg` or `userdefined_sum`. When
+using either of the last two options, the user must provide a definition of the `derUserDefined` derived variable. If multiple components
+are defined in the `derUserDefined` function, the first one is used for load balancing.
+
 Time stepping parameters
 ------------------------
 
@@ -125,7 +146,6 @@ overview of the available controls:
 The `field_name` can be any of the state or derived variables (see below) component.
 Additional controls specific to embedded boundaries are discussed below.
-
 
 PeleLMeX derived variables
 --------------------------
 
@@ -186,8 +206,17 @@ The following list of derived variables are available in PeleLMeX:
    * - `coordinates`
      - AMREX_SPACEDIM
      - Cell-center coordinates
+   * - `DistributionMap`
+     - 1
+     - The MPI rank of each box
+   * - `derUserDefined`
+     - ?
+     - A user-defined derived variable whose number of components is provided by the user (see below).
 
 Note that `mixture_fraction` and `progress_variable` requires additional inputs from the users as described below.
 
+The `derUserDefined` option allows the user to define their own derived variable, which can comprise several components. To do
+so, the user needs to copy the Source/DeriveUserDefined.cpp file into their run folder and update it. The number of
+components is defined based on the size of the vector returned by pelelm_setuserderives().
 
 PeleLMeX algorithm
 ------------------
diff --git a/Docs/source/manual/Model.rst b/Docs/source/manual/Model.rst
index 19bc01ed..09a3813a 100644
--- a/Docs/source/manual/Model.rst
+++ b/Docs/source/manual/Model.rst
@@ -40,7 +40,7 @@ In a nutshell, `PeleLMeX` features include:
 Mathematical background
 -----------------------
 
-`PeleLMeX` evolves chemically reacting low Mach number flows with block-structured adaptive mesh refinement (AMR). The code depends upon the `AMReX `_ library to provide the underlying data structures, and tools to manage and operate on them across massively parallel computing architectures. `PeleLMeX` also utilizes the source code and algorithmic infrastructure of `AMReX-Hydro `_. `PeleLMeX` borrows heavily from `PeleLM `_. The core algorithms in `PeleLM` are described in the following papers:
+`PeleLMeX` evolves chemically reacting low Mach number flows with block-structured adaptive mesh refinement (AMR). The code depends upon the `AMReX `_ library to provide the underlying data structures, and tools to manage and operate on them across massively parallel computing architectures. `PeleLMeX` also utilizes the source code and algorithmic infrastructure of `AMReX-Hydro `_. `PeleLMeX` borrows heavily from `PeleLM`_. The core algorithms in `PeleLM` are described in the following papers:
 
 * *A conservative, thermodynamically consistent numerical approach for low Mach number combustion. I. Single-level integration*, A. Nonaka, J. B. Bell, and M. S. Day, *Combust. Theor. Model.*, **22** (1) 156-184 (2018)
 
@@ -172,12 +172,15 @@ and simplified velocity constraint,
 PeleLMeX Algorithm
 ------------------
 
-An overview of `PeleLMeX` time-advance function is provided in the figure below and details are provided in the following subsections.
+An overview of the `PeleLMeX` time-advance function is provided in :numref:`LMeX_Algo` and details are provided in the following subsections.
 
 .. figure:: images/model/PeleLMeX_Algorithm.png
+   :name: LMeX_Algo
    :align: center
   :figwidth: 50%
 
+   Flowchart of the *PeleLMeX* advance function.
+
    The three steps of the low Mach number projection scheme described :ref:`below ` are referenced to better
    emphasize how the thermodynamic solve is closely weaved into the fractional step appraoch. Striped boxes indicate
    where the :ref:`Godunov procedure ` is employed while the four different linear solves are highlighted.
 
@@ -340,7 +343,7 @@ Note that in the presence of EB, only the `Godunov_PLM` variant is available.
 AMR extension
 ^^^^^^^^^^^^^
 
-In contrast with `PeleLM `_, `PeleLMeX` do not rely a on subcycling appraoch to advance the AMR hierarchy.
+In contrast with `PeleLM`_, `PeleLMeX` do not rely a on subcycling appraoch to advance the AMR hierarchy. This difference is illustrated in the figure below comparing the multi-level time-stepping approach in both codes: .. figure:: images/model/PeleLMeX_Subcycling.png diff --git a/Docs/source/manual/index.rst b/Docs/source/manual/index.rst index 5329542a..271bf139 100644 --- a/Docs/source/manual/index.rst +++ b/Docs/source/manual/index.rst @@ -22,6 +22,7 @@ point your web browser at the file ``${PELELMEX_HOME}/Docs/build/html/index.html :caption: Theory: Model.rst + Implementation.rst Validation.rst Performances.rst diff --git a/Source/CMakeLists.txt b/Source/CMakeLists.txt index 2ab506fa..c144a5eb 100644 --- a/Source/CMakeLists.txt +++ b/Source/CMakeLists.txt @@ -42,7 +42,9 @@ target_sources(pelelmex Utils.cpp ) -add_subdirectory(Diagnostics) +if (PELELMEX_USE_DIAGS) + add_subdirectory(Diagnostics) +endif () if (PELELMEX_USE_EFIELD) add_subdirectory(Efield) diff --git a/Source/PeleLM.H b/Source/PeleLM.H index 96a96bcd..82e359b1 100644 --- a/Source/PeleLM.H +++ b/Source/PeleLM.H @@ -1,29 +1,36 @@ #ifndef _PeleLM_H_ #define _PeleLM_H_ +// PeleLMeX includes +#include "PeleLMUserKeys.H" +#include "PeleLM_Index.H" +#include "PeleLMDerive.H" +#include "pelelm_prob_parm.H" +#include "DiffusionOp.H" +#include "DiagBase.H" +#ifdef PELE_USE_EFIELD +#include "PrecondOp.H" +#endif + +// PelePhysics lib +#include +#include +#include +#include +#include + +// AMReX-Hydro lib +#include +#include +#include + +// AMReX lib #include #include #include -#include -#include #include #include -#include -#include -#include "PelePhysics.H" -#include "PMFData.H" -#include "turbinflow.H" -#include "ReactorBase.H" -#include -#include -#include -#include -#include - -#ifdef PELE_USE_EFIELD -#include -#endif // Forward declarations #ifdef PELELM_USE_SPRAY @@ -45,7 +52,7 @@ class PeleLM : public amrex::AmrCore { // constructor PeleLM(); - //destructor + // destructor virtual ~PeleLM(); // Setup function @@ -105,8 +112,8 @@ class PeleLM : public amrex::AmrCore { void readProbParm(); // ReadGridFile - void readGridFile(std::string grid_file, - amrex::Vector& input_ba); + void readGridFile(std::string grid_file, + amrex::Vector& input_ba); #ifdef PELELM_USE_SPRAY // ReadSprayParameters @@ -249,6 +256,10 @@ class PeleLM : public amrex::AmrCore { // checkRunParams void checkRunParams(); + void computeCosts(int a_lev); + void computeCosts(int a_lev, amrex::LayoutData &a_costs, + int a_costMethod); + //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- @@ -617,6 +628,7 @@ class PeleLM : public amrex::AmrCore { void fillpatch_gradp(int lev, amrex::Real a_time, amrex::MultiFab& a_gp, int nGhost); void fillpatch_reaction(int lev, amrex::Real a_time, amrex::MultiFab& a_I_R, int nGhost); void fillpatch_forces(amrex::Real a_time, amrex::Vector const &a_velForces, int nGrowForce); + void fillpatch_chemFunctCall(int lev, amrex::Real a_time, amrex::MultiFab& a_fctC, int nGhost); #ifdef PELE_USE_EFIELD void fillpatch_phiV(int lev, amrex::Real a_time, amrex::MultiFab& a_phiV, int phiV_comp, int nGhost); void fillPatchExtrap(amrex::Real a_time, amrex::Vector const &a_MF, int a_nGrow); @@ -629,6 +641,7 @@ class PeleLM : public amrex::AmrCore { void fillcoarsepatch_divu(int lev, amrex::Real a_time, amrex::MultiFab& a_divu, int nGhost); void fillcoarsepatch_gradp(int lev, amrex::Real a_time, amrex::MultiFab& a_gp, int nGhost); void 
fillcoarsepatch_reaction(int lev, amrex::Real a_time, amrex::MultiFab& a_I_R, int nGhost); + void fillcoarsepatch_chemFunctCall(int lev, amrex::Real a_time, amrex::MultiFab& a_fctC, int nGhost); // Fill physical boundaries void setInflowBoundaryVel (amrex::MultiFab &a_vel, int lev, PeleLM::TimeStamp a_time); @@ -649,6 +662,50 @@ class PeleLM : public amrex::AmrCore { //----------------------------------------------------------------------------- // UTILS + template + void parseUserKey(amrex::ParmParse &a_pp, + const std::string &a_search_key, + const K &keyT, + int &key, + int keyIndex = -1) + { + std::string userkey = "default"; + if (keyIndex < 0 ) { + a_pp.query(a_search_key.c_str(), userkey); + } else { + int search_key_size = a_pp.countval(a_search_key.c_str()); + if (search_key_size > 0) { + AMREX_ASSERT(keyIndex < search_key_size); + amrex::Vector userkeyVec(search_key_size); + a_pp.getarr(a_search_key.c_str(),userkeyVec,keyIndex,1); + userkey = userkeyVec[keyIndex]; + } + } + std::transform(userkey.begin(), userkey.end(), userkey.begin(), ::tolower); + if (std::find(std::begin(keyT.searchKey), + std::end(keyT.searchKey), a_search_key) + == std::end(keyT.searchKey)) { + std::string errmsg = "Improper use of parseUserKey: search_key incompatible with provided userKey. \n"; + errmsg += "The valid search keys are:\n"; + for ( const auto &validKey : keyT.searchKey ) { + errmsg += " - " + validKey + " \n"; + } + amrex::Abort(errmsg); + } + + if (keyT.str2int.count(userkey) == 0) { + std::string errmsg = "Invalid key for search_key " + a_search_key + ". \n"; + errmsg += "Valid keys are:\n"; + for ( const auto &valid_pair : keyT.str2int ) { + if (valid_pair.first != "default"){ + errmsg += " - " + valid_pair.first + " \n"; + } + } + amrex::Abort(errmsg); + } + + key = keyT.str2int.at(userkey); + }; // flux divergence void fluxDivergence(const amrex::Vector &a_divergence, @@ -724,6 +781,9 @@ class PeleLM : public amrex::AmrCore { void floorSpecies(const PeleLM::TimeStamp &a_time); + void loadBalanceChem(); + void loadBalanceChemLev(int a_lev); + amrex::Real MLNorm0(const amrex::Vector &a_MF); amrex::Vector MLNorm0(const amrex::Vector &a_MF, @@ -1099,12 +1159,12 @@ class PeleLM : public amrex::AmrCore { amrex::Vector m_regrid_ba; // Problem parameters - ProbParm* prob_parm; - ProbParm* prob_parm_d; + ProbParm* prob_parm = nullptr; + ProbParm* prob_parm_d = nullptr; #ifdef PELELM_USE_SOOT // Soot parameters - SootModel* soot_model; + SootModel* soot_model = nullptr; bool do_soot_solve; #endif @@ -1126,14 +1186,6 @@ class PeleLM : public amrex::AmrCore { amrex::Vector> m_diagnostics; amrex::Vector m_diagVars; - // Linear solvers - std::unique_ptr m_diffusion_op; - std::unique_ptr m_mcdiffusion_op; - std::unique_ptr m_diffusionTensor_op; - std::unique_ptr macproj; - int m_macProjNeedReset = 0; - int m_macProjOldSize = 0; - int m_verbose = 0; // IO options @@ -1235,7 +1287,18 @@ class PeleLM : public amrex::AmrCore { // Misc. 
int m_floor_species = 0; - int m_checkMem = 0; + + // Performances + int m_checkMem {0}; + int m_doLoadBalance {0}; + int m_loadBalanceCost {LoadBalanceCost::Ncell}; + int m_loadBalanceMethod {LoadBalanceMethod::SFC}; + int m_loadBalanceCostChem {LoadBalanceCost::ChemFunctCallMax}; + int m_loadBalanceMethodChem {LoadBalanceMethod::Knapsack}; + amrex::Real m_loadBalanceKSfactor {1.2}; + amrex::Real m_loadBalanceEffRatioThreshold {1.1}; + amrex::Vector > > m_costs; + amrex::Vector m_loadBalanceEff; // SDC int m_nSDCmax = 1; @@ -1312,7 +1375,7 @@ class PeleLM : public amrex::AmrCore { amrex::Vector typical_values; #ifdef AMREX_USE_EB - // EB refinement + // EB std::string m_EB_refine_type = "Static"; int m_EB_refine_LevMax = -1; int m_EB_refine_LevMin = -1; @@ -1353,6 +1416,12 @@ class PeleLM : public amrex::AmrCore { //----------------------------------------------------------------------------- // Linear Solvers + std::unique_ptr m_diffusion_op; + std::unique_ptr m_mcdiffusion_op; + std::unique_ptr m_diffusionTensor_op; + std::unique_ptr macproj; + int m_macProjNeedReset {0}; + int m_macProjOldSize {0}; // Nodal projection int m_nodal_mg_max_coarsening_level = 100; diff --git a/Source/PeleLMBC.cpp b/Source/PeleLMBC.cpp index 05a56033..a2df51a2 100644 --- a/Source/PeleLMBC.cpp +++ b/Source/PeleLMBC.cpp @@ -638,6 +638,36 @@ void PeleLM::fillpatch_reaction(int lev, } } +// Fill functC +void PeleLM::fillpatch_chemFunctCall(int lev, + const amrex::Real a_time, + amrex::MultiFab &a_fctC, + int nGhost) { + ProbParm const* lprobparm = prob_parm_d; + if (lev == 0) { + PhysBCFunct> bndry_func(geom[lev], {m_bcrec_force}, + PeleLMCCFillExtDirDummy{lprobparm, m_nAux}); + FillPatchSingleLevel(a_fctC, IntVect(nGhost), a_time, + {&(m_leveldatareact[lev]->functC)},{a_time}, + 0, 0, 1, geom[lev], bndry_func, 0); + } else { + + // Interpolator + auto* mapper = getInterpolator(); + + PhysBCFunct> crse_bndry_func(geom[lev-1], {m_bcrec_force}, + PeleLMCCFillExtDirDummy{lprobparm, m_nAux}); + PhysBCFunct> fine_bndry_func(geom[lev], {m_bcrec_force}, + PeleLMCCFillExtDirDummy{lprobparm, m_nAux}); + FillPatchTwoLevels(a_fctC, IntVect(nGhost), a_time, + {&(m_leveldatareact[lev-1]->functC)},{a_time}, + {&(m_leveldatareact[lev]->functC)},{a_time}, + 0, 0, 1, geom[lev-1], geom[lev], + crse_bndry_func, 0, fine_bndry_func, 0, + refRatio(lev-1), mapper, {m_bcrec_force}, 0); + } +} + // Fill the state void PeleLM::fillcoarsepatch_state(int lev, const amrex::Real a_time, @@ -730,6 +760,28 @@ void PeleLM::fillcoarsepatch_reaction(int lev, refRatio(lev-1), mapper, {m_bcrec_force}, 0); } +// Fill coarse patch of chem function call +void PeleLM::fillcoarsepatch_chemFunctCall(int lev, + const amrex::Real a_time, + amrex::MultiFab &a_fctC, + int nGhost) { + ProbParm const* lprobparm = prob_parm_d; + + // Interpolator + auto* mapper = getInterpolator(m_regrid_interp_method); + + PhysBCFunct> crse_bndry_func(geom[lev-1], {m_bcrec_force}, + PeleLMCCFillExtDirDummy{lprobparm, m_nAux}); + PhysBCFunct> fine_bndry_func(geom[lev], {m_bcrec_force}, + PeleLMCCFillExtDirDummy{lprobparm, m_nAux}); + InterpFromCoarseLevel(a_fctC, IntVect(nGhost), a_time, + m_leveldatareact[lev-1]->functC, 0, 0, 1, + geom[lev-1], geom[lev], + crse_bndry_func, 0, fine_bndry_func, 0, + refRatio(lev-1), mapper, {m_bcrec_force}, 0); +} + + // Fill the inflow boundary of a velocity MF // used for velocity projection void PeleLM::setInflowBoundaryVel(MultiFab &a_vel, diff --git a/Source/PeleLMDeriveFunc.H b/Source/PeleLMDeriveFunc.H index d763c07e..ebbfd4c6 100644 
--- a/Source/PeleLMDeriveFunc.H +++ b/Source/PeleLMDeriveFunc.H @@ -98,6 +98,11 @@ void pelelm_derlambda (PeleLM* a_pelelm, const amrex::Box& bx, amrex::FArrayBox& const amrex::Geometry& geom, amrex::Real time, const amrex::Vector &bcrec, int level); +void pelelm_derdmap (PeleLM* a_pelelm, const amrex::Box& bx, amrex::FArrayBox& derfab, int dcomp, int ncomp, + const amrex::FArrayBox& statefab, const amrex::FArrayBox& reactfab, const amrex::FArrayBox& pressfab, + const amrex::Geometry& geom, + amrex::Real time, const amrex::Vector &bcrec, int level); + amrex::Vector pelelm_setuserderives(); void pelelm_deruserdef (PeleLM* a_pelelm, const amrex::Box& bx, amrex::FArrayBox& derfab, int dcomp, int ncomp, diff --git a/Source/PeleLMDeriveFunc.cpp b/Source/PeleLMDeriveFunc.cpp index 0d82c0aa..82c98509 100644 --- a/Source/PeleLMDeriveFunc.cpp +++ b/Source/PeleLMDeriveFunc.cpp @@ -672,3 +672,22 @@ void pelelm_derlambda (PeleLM* a_pelelm, const Box& bx, FArrayBox& derfab, int d } }); } + +// +// Extract Distribution Mapping +// +void pelelm_derdmap (PeleLM* a_pelelm, const Box& bx, FArrayBox& derfab, int dcomp, int /*ncomp*/, + const FArrayBox& /*statefab*/, const FArrayBox& /*reactfab*/, const FArrayBox& /*pressfab*/, + const Geometry& /*geom*/, + Real /*time*/, const Vector& /*bcrec*/, int /*level*/) + +{ + AMREX_ASSERT(derfab.box().contains(bx)); + auto der = derfab.array(dcomp); + const int myrank = ParallelDescriptor::MyProc(); + amrex::ParallelFor(bx, + [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + der(i,j,k) = myrank; + }); +} diff --git a/Source/PeleLMInit.cpp b/Source/PeleLMInit.cpp index 5176937d..a60bd3d1 100644 --- a/Source/PeleLMInit.cpp +++ b/Source/PeleLMInit.cpp @@ -88,6 +88,9 @@ void PeleLM::MakeNewLevelFromScratch( int lev, m_t_new[lev] = time; m_t_old[lev] = time - 1.0e200; + // Load balance + m_costs[lev] = std::make_unique>(ba, dm); + // Mac projector #ifdef AMREX_USE_EB macproj.reset(new Hydro::MacProjector(Geom(0,finest_level), @@ -247,6 +250,9 @@ void PeleLM::initData() { advanceChemistry(lev, dtInit/2.0, Forcing); } } + if (m_doLoadBalance) { + loadBalanceChemLev(lev); + } } // Copy back old -> new copyStateOldToNew(); diff --git a/Source/PeleLMReactions.cpp b/Source/PeleLMReactions.cpp index b5f130e3..141e3b09 100644 --- a/Source/PeleLMReactions.cpp +++ b/Source/PeleLMReactions.cpp @@ -25,8 +25,7 @@ void PeleLM::advanceChemistry(std::unique_ptr &advData) } // This advanceChemistry is called on the finest level -// It works with the AmrCore BoxArray and do not involve ParallelCopy and averaged down -// version of I_R +// It works with the AmrCore BoxArray and do not involve ParallelCopy void PeleLM::advanceChemistry(int lev, const Real &a_dt, MultiFab &a_extForcing) diff --git a/Source/PeleLMRegrid.cpp b/Source/PeleLMRegrid.cpp index 8660e846..7cf8bb35 100644 --- a/Source/PeleLMRegrid.cpp +++ b/Source/PeleLMRegrid.cpp @@ -6,11 +6,253 @@ void PeleLM::regrid(int lbase, amrex::Real time, bool initial) { - if (lbase >= max_level) return; + BL_PROFILE("PeleLM::regrid()"); + + if (!m_doLoadBalance && lbase >= max_level) return; + if (!m_regrid_file.empty()) { regridFromGridFile(lbase, time, initial); + } else { - AmrCore::regrid(lbase, time, initial); + + // Load balance base grid + if (lbase == 0 && m_doLoadBalance && !initial) { + + if ( m_verbose > 0) { + Print() << " Load balancing level 0 \n"; + } + + int remakeLevel = false; + + computeCosts(0); + + // Use efficiency: average MPI rank cost / max cost + amrex::Real currentEfficiency = 0.0; + amrex::Real 
testEfficiency = 0.0; + + DistributionMapping test_dmap; + // Build the test dmap, w/o braodcasting + if (m_loadBalanceMethod == LoadBalanceMethod::SFC) { + + test_dmap = DistributionMapping::makeSFC(*m_costs[0], + currentEfficiency, testEfficiency, + false, + ParallelDescriptor::IOProcessorNumber()); + + } else if (m_loadBalanceMethod == LoadBalanceMethod::Knapsack) { + + const amrex::Real navg = static_cast(grids[0].size()) / + static_cast(ParallelDescriptor::NProcs()); + const int nmax = static_cast(std::max(std::round(m_loadBalanceKSfactor*navg), std::ceil(navg))); + test_dmap = DistributionMapping::makeKnapSack(*m_costs[0], + currentEfficiency, testEfficiency, + nmax, + false, + ParallelDescriptor::IOProcessorNumber()); + } + + // IO proc determine if the test dmap offers significant improvements + if ((m_loadBalanceEffRatioThreshold > 0.0) + && (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())) + { + remakeLevel = remakeLevel || (testEfficiency > m_loadBalanceEffRatioThreshold*currentEfficiency); + } + ParallelDescriptor::Bcast(&remakeLevel, 1, ParallelDescriptor::IOProcessorNumber()); + + if ( m_verbose > 1 && remakeLevel) { + Print() << " Current LoadBalancing efficiency: " << currentEfficiency << "\n" + << " Test LoadBalancing efficiency: " << testEfficiency << " \n"; + } + + // Bcast the test dmap and remake level + if (remakeLevel) { + Vector pmap; + if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) { + pmap = test_dmap.ProcessorMap(); + } else { + pmap.resize(static_cast(grids[0].size())); + } + ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber()); + + if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber()) { + test_dmap = DistributionMapping(pmap); + } + + RemakeLevel(0, time, grids[0], test_dmap); + SetDistributionMap(0, test_dmap); + } + } + + if (lbase >= max_level) return; + + int new_finest; + Vector new_grids(finest_level+2); + MakeNewGrids(lbase, time, new_finest, new_grids); + + BL_ASSERT(new_finest <= finest_level+1); + + bool coarse_ba_changed = false; + for (int lev = lbase+1; lev <= new_finest; ++lev) + { + if (lev <= finest_level) // an old level + { + bool ba_changed = (new_grids[lev] != grids[lev]); + + // If initial grid generation or not doing load balancing, + // remake level only if the grid changed using the default DMap strategy + if ( initial || !m_doLoadBalance ) { + if ( ba_changed || coarse_ba_changed ) { + BoxArray level_grids = grids[lev]; + DistributionMapping level_dmap = dmap[lev]; + if (ba_changed) { + level_grids = new_grids[lev]; + level_dmap = DistributionMapping(level_grids); + } + const auto old_num_setdm = num_setdm; + RemakeLevel(lev, time, level_grids, level_dmap); + SetBoxArray(lev, level_grids); + if (old_num_setdm == num_setdm) { + SetDistributionMap(lev, level_dmap); + } + } + + // When doing load balancing, remake level if grid changed or a better dmap + // can be obtained + } else { + + int remakeLevel = false; + + BoxArray new_ba; + DistributionMapping new_dmap; + + // If the grid changed, let's build a new dmap + if (ba_changed) { + + remakeLevel = true; + + new_ba = new_grids[lev]; + new_dmap = DistributionMapping(new_ba); + + // Get the cost on a LayoutData associated with the new grid + LayoutData new_cost(new_ba,new_dmap); + computeCosts(lev, new_cost, m_loadBalanceCost); + + if (m_loadBalanceMethod == LoadBalanceMethod::SFC) { + Vector costsVec(new_ba.size()); + ParallelDescriptor::GatherLayoutDataToVector(new_cost, 
costsVec, + ParallelContext::IOProcessorNumberSub()); + ParallelDescriptor::Bcast(costsVec.data(), costsVec.size(), ParallelContext::IOProcessorNumberSub()); + Real efficiency; + new_dmap = DistributionMapping::makeSFC(costsVec, new_ba, efficiency); + + } else if (m_loadBalanceMethod == LoadBalanceMethod::Knapsack) { + + const amrex::Real navg = static_cast(new_ba.size()) / + static_cast(ParallelDescriptor::NProcs()); + const int nmax = static_cast(std::max(std::round(m_loadBalanceKSfactor*navg), std::ceil(navg))); + Vector costsVec(new_ba.size()); + ParallelDescriptor::GatherLayoutDataToVector(new_cost, costsVec, + ParallelContext::IOProcessorNumberSub()); + ParallelDescriptor::Bcast(costsVec.data(), costsVec.size(), ParallelContext::IOProcessorNumberSub()); + Real efficiency; + new_dmap = DistributionMapping::makeKnapSack(costsVec, efficiency, nmax); + } + + // Let's see if we can get a better dmap + } else { + + if ( m_verbose > 1) { + Print() << " Load balancing level " << lev << "\n"; + } + + // Try to build a new dmap with the same old grid + new_ba = grids[lev]; + DistributionMapping test_dmap; + + computeCosts(lev); + + // Use efficiency: average MPI rank cost / max cost + amrex::Real currentEfficiency = 0.0; + amrex::Real testEfficiency = 0.0; + + // Build the test dmap, w/o braodcasting + if (m_loadBalanceMethod == LoadBalanceMethod::SFC) { + + test_dmap = DistributionMapping::makeSFC(*m_costs[lev], + currentEfficiency, testEfficiency, + false, + ParallelDescriptor::IOProcessorNumber()); + + } else if (m_loadBalanceMethod == LoadBalanceMethod::Knapsack) { + + const amrex::Real navg = static_cast(new_ba.size()) / + static_cast(ParallelDescriptor::NProcs()); + const int nmax = static_cast(std::max(std::round(m_loadBalanceKSfactor*navg), std::ceil(navg))); + test_dmap = DistributionMapping::makeKnapSack(*m_costs[lev], + currentEfficiency, testEfficiency, + nmax, + false, + ParallelDescriptor::IOProcessorNumber()); + } + + // IO proc determine if the test dmap offers significant improvements + if ((m_loadBalanceEffRatioThreshold > 0.0) + && (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())) + { + remakeLevel = remakeLevel || (testEfficiency > m_loadBalanceEffRatioThreshold*currentEfficiency); + } + ParallelDescriptor::Bcast(&remakeLevel, 1, ParallelDescriptor::IOProcessorNumber()); + + if ( m_verbose > 1 && remakeLevel) { + Print() << " Current LoadBalancing efficiency: " << currentEfficiency << "\n" + << " Test LoadBalancing efficiency: " << testEfficiency << " \n"; + } + + // Bcast the test dmap if we plan on remaking the level + if (remakeLevel) { + Vector pmap; + if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) { + pmap = test_dmap.ProcessorMap(); + } else { + pmap.resize(static_cast(new_ba.size())); + } + ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber()); + + if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber()) { + test_dmap = DistributionMapping(pmap); + } + new_dmap = test_dmap; + } + } + + if (remakeLevel) { + const auto old_num_setdm = num_setdm; + RemakeLevel(lev, time, new_ba, new_dmap); + SetBoxArray(lev, new_ba); + if (old_num_setdm == num_setdm) { + SetDistributionMap(lev, new_dmap); + } + } + } + coarse_ba_changed = ba_changed; + } else { // a new level, use default DMap strategy + DistributionMapping new_dmap(new_grids[lev]); + const auto old_num_setdm = num_setdm; + MakeNewLevelFromCoarse(lev, time, new_grids[lev], new_dmap); + SetBoxArray(lev, 
new_grids[lev]); + if (old_num_setdm == num_setdm) { + SetDistributionMap(lev, new_dmap); + } + } + } + + for (int lev = new_finest+1; lev <= finest_level; ++lev) { + ClearLevel(lev); + ClearBoxArray(lev); + ClearDistributionMap(lev); + } + + finest_level = new_finest; } } @@ -25,7 +267,7 @@ void PeleLM::MakeNewLevelFromCoarse( int lev, if (m_verbose > 2) { auto const dx = geom[lev].CellSizeArray(); Real vol = AMREX_D_TERM(dx[0],*dx[1],*dx[2]); - amrex::Print() << " with " << ba.numPts() << " cells," + amrex::Print() << " with " << ba.numPts() << " cells, " << ba.size() << " boxes," << " over " << ba.numPts() * vol / geom[0].ProbSize() * 100 << "% of the domain \n"; } if (m_verbose > 3) { @@ -101,6 +343,9 @@ void PeleLM::MakeNewLevelFromCoarse( int lev, m_precond_op.reset(); #endif + // Load balance + m_costs[lev] = std::make_unique>(ba, dm); + // DiffusionOp will be recreated m_diffusion_op.reset(); m_mcdiffusion_op.reset(); @@ -124,7 +369,7 @@ void PeleLM::RemakeLevel( int lev, if (m_verbose > 2) { auto const dx = geom[lev].CellSizeArray(); Real vol = AMREX_D_TERM(dx[0],*dx[1],*dx[2]); - amrex::Print() << " with " << ba.numPts() << " cells," + amrex::Print() << " with " << ba.numPts() << " cells," << ba.size() << " boxes," << " over " << ba.numPts() * vol / geom[0].ProbSize() * 100 << "% of the domain \n"; } if (m_verbose > 3) { @@ -200,6 +445,9 @@ void PeleLM::RemakeLevel( int lev, m_precond_op.reset(); #endif + // Load balance + m_costs[lev] = std::make_unique>(ba, dm); + // DiffusionOp will be recreated m_diffusion_op.reset(); m_mcdiffusion_op.reset(); @@ -233,6 +481,67 @@ void PeleLM::ClearLevel(int lev) { } #endif m_extSource[lev]->clear(); + + m_costs[lev].reset(); + m_loadBalanceEff[lev] = -1.0; +} + +void PeleLM::computeCosts(int a_lev, LayoutData &a_costs, int a_costMethod) +{ + if (a_costMethod == LoadBalanceCost::Ncell ) { + for (MFIter mfi(a_costs, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = mfi.validbox().numPts(); + } + } else if (a_costMethod == LoadBalanceCost::ChemFunctCallAvg ) { + MultiFab costMF(a_costs.boxArray(), a_costs.DistributionMap(), 1, 0); + fillpatch_chemFunctCall(a_lev, m_cur_time, costMF, 0); + for (MFIter mfi(costMF, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = costMF[mfi].sum(mfi.validbox(),0) / mfi.validbox().numPts(); + } + } else if (a_costMethod == LoadBalanceCost::ChemFunctCallMax ) { + MultiFab costMF(a_costs.boxArray(), a_costs.DistributionMap(), 1, 0); + fillpatch_chemFunctCall(a_lev, m_cur_time, costMF, 0); + for (MFIter mfi(costMF, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = costMF[mfi].max(mfi.validbox(),0); + } + } else if (a_costMethod == LoadBalanceCost::ChemFunctCallSum ) { + MultiFab costMF(a_costs.boxArray(), a_costs.DistributionMap(), 1, 0); + fillpatch_chemFunctCall(a_lev, m_cur_time, costMF, 0); + for (MFIter mfi(costMF, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = costMF[mfi].sum(mfi.validbox(),0); + } + } else if (a_costMethod == LoadBalanceCost::UserDefinedDerivedAvg) { + MultiFab costMF(a_costs.boxArray(), a_costs.DistributionMap(), 1, 0); + costMF.setVal(0.0); + std::unique_ptr mf; + mf = derive("derUserDefined", m_cur_time, a_lev, 0); + costMF.ParallelCopy(*mf,0,0,1); + for (MFIter mfi(costMF, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = costMF[mfi].sum(mfi.validbox(),0) / mfi.validbox().numPts(); + } + } else if (a_costMethod == LoadBalanceCost::UserDefinedDerivedSum) { + MultiFab costMF(a_costs.boxArray(), a_costs.DistributionMap(), 1, 0); + costMF.setVal(0.0); + std::unique_ptr mf; + mf = 
derive("derUserDefined", m_cur_time, a_lev, 0); + costMF.ParallelCopy(*mf,0,0,1); + for (MFIter mfi(costMF, false); mfi.isValid(); ++mfi) + { + a_costs[mfi] = costMF[mfi].sum(mfi.validbox(),0); + } + } else { + Abort(" Unknown cost estimate method !"); + } +} + +void PeleLM::computeCosts(int a_lev) +{ + computeCosts(a_lev, *m_costs[a_lev], m_loadBalanceCost); } void PeleLM::resetMacProjector() diff --git a/Source/PeleLMSetup.cpp b/Source/PeleLMSetup.cpp index 517f3275..e36b98e4 100644 --- a/Source/PeleLMSetup.cpp +++ b/Source/PeleLMSetup.cpp @@ -408,6 +408,25 @@ void PeleLM::readParameters() { } } + // ----------------------------------------- + // Load Balancing + // ----------------------------------------- + pp.query("do_load_balancing",m_doLoadBalance); + parseUserKey(pp, "load_balancing_method", lbmethod, m_loadBalanceMethod); + parseUserKey(pp, "load_balancing_cost_estimate", lbcost, m_loadBalanceCost); + pp.query("load_balancing_efficiency_threshold",m_loadBalanceEffRatioThreshold); + parseUserKey(pp, "chem_load_balancing_method", lbmethod, m_loadBalanceMethodChem); + parseUserKey(pp, "chem_load_balancing_cost_estimate", lbcost, m_loadBalanceCostChem); + + // Deactivate load balancing for serial runs +#ifdef AMREX_USE_MPI + if (ParallelContext::NProcsSub() == 1) { + m_doLoadBalance = 0; + } +#else + m_doLoadBalance = 0; +#endif + // ----------------------------------------- // Advection // ----------------------------------------- @@ -485,7 +504,7 @@ void PeleLM::readParameters() { ppa.query("max_dt", m_max_dt); ppa.query("min_dt", m_min_dt); - if ( max_level > 0 ) { + if ( max_level > 0 || m_doLoadBalance) { ppa.query("regrid_int", m_regrid_int); ppa.query("regrid_on_restart", m_regrid_on_restart); } @@ -858,6 +877,9 @@ void PeleLM::derivedSetup() } + // Distribution Map + derive_lst.add("DistributionMap",IndexType::TheCellType(),1,pelelm_derdmap,the_same_box); + // Cell average pressure derive_lst.add("avg_pressure",IndexType::TheCellType(),1,pelelm_deravgpress,the_same_box); @@ -1125,8 +1147,13 @@ void PeleLM::resizeArray() { // Time m_t_old.resize(max_level+1); m_t_new.resize(max_level+1); + #ifdef PELELM_USE_SPRAY m_spraystate.resize(max_level+1); m_spraysource.resize(max_level+1); #endif + + // Load balancing + m_costs.resize(max_level+1); + m_loadBalanceEff.resize(max_level+1); } diff --git a/Source/PeleLMUserKeys.H b/Source/PeleLMUserKeys.H new file mode 100644 index 00000000..53df4ef3 --- /dev/null +++ b/Source/PeleLMUserKeys.H @@ -0,0 +1,168 @@ +#ifndef PELE_USERKEYS_H +#define PELE_USERKEYS_H +#include + +#include +#include + +/** + * \brief struct holding PeleLMeX physical BC options + */ +struct BoundaryCondition { + enum { + BCInterior = 0, + BCInflow, + BCOutflow, + BCSymmetry, + BCSlipWallAdiab, + BCNoSlipWallAdiab, + BCSlipWallIsotherm, + BCNoSlipWallIsotherm + }; + const std::map str2int = { + {"interior", BCInterior}, + {"inflow", BCInflow}, + {"outflow", BCOutflow}, + {"symmetry", BCSymmetry}, + {"slipwalladiab", BCSlipWallAdiab}, + {"noslipwalladiab", BCNoSlipWallAdiab}, + {"bcslipwallisotherm", BCSlipWallIsotherm}, + {"bcnoslipwallisotherm", BCNoSlipWallIsotherm} + }; + const amrex::Array searchKey{"lo_bc","hi_bc"}; +}; +const BoundaryCondition boundarycondition; + +/** + * \brief struct holding PeleLMeX NS solver options + default is LowMachNumber + */ +struct NSSolver { + enum { + LowMachNumber = 0, + Incompressible + }; + const std::map str2int = { + {"lowmachnumber", LowMachNumber}, + {"incompressible", Incompressible}, + {"default", LowMachNumber} + }; + 
const amrex::Array searchKey{"ns_solver"}; +}; +const NSSolver nssolver; + +/** + * \brief struct holding PeleLMeX interpolator options + default is PiecewiseLinearConserv + */ +struct Interpolator { + enum { + PiecewiseLinearConserv = 0, + PiecewiseLinearConservMinMax, + PiecewiseConstant + }; + const std::map str2int = { + {"pwlinearconserv", PiecewiseLinearConserv}, + {"pwlinearconservminmax", PiecewiseLinearConservMinMax}, + {"pwconstant", PiecewiseConstant}, + {"default", PiecewiseLinearConserv} + }; + const amrex::Array searchKey{"regrid_interp_method"}; +}; +const Interpolator interpolator; + +/** + * \brief struct holding PeleLMeX LES SGS model options + default is None + */ +struct LESModel { + enum { + None = 0, + Smagorinsky, + WALE, + Sigma + }; + const std::map str2int = { + {"none", None}, + {"smagorinsky", Smagorinsky}, + {"wale", WALE}, + {"sigma", Sigma}, + {"default", None} + }; + const amrex::Array searchKey{"les_model"}; +}; +const LESModel lesmodel; + +/** + * \brief struct holding PeleLMeX advection scheme options + default is Godunov_PLM + */ +struct AdvectionScheme { + enum { + Godunov_PLM = 0, + Godunov_PPM, + Godunov_PPM_WENOZ, + Godunov_PPM_NOLIM, + Godunov_BDS + }; + const std::map str2int = { + {"godunov_plm", Godunov_PLM}, + {"godunov_ppm", Godunov_PPM}, + {"godunov_ppm_wenoz", Godunov_PPM_WENOZ}, + {"godunov_ppm_nolim", Godunov_PPM_NOLIM}, + {"godunov_bds", Godunov_BDS}, + {"default", Godunov_PLM} + }; + const amrex::Array searchKey{"advection_scheme"}; +}; +const AdvectionScheme advscheme; + +/** + * \brief struct holding PeleLMeX load balancing cost options + default is Ncell + */ +struct LoadBalanceCost { + enum { + Ncell = 0, + ChemFunctCallAvg, + ChemFunctCallMax, + ChemFunctCallSum, + UserDefinedDerivedAvg, + UserDefinedDerivedSum, + Timers + }; + const std::map str2int = { + {"ncell", Ncell}, + {"chemfunctcall_avg", ChemFunctCallAvg}, + {"chemfunctcall_max", ChemFunctCallMax}, + {"chemfunctcall_sum", ChemFunctCallSum}, + {"userdefined_avg", UserDefinedDerivedAvg}, + {"userdefined_sum", UserDefinedDerivedSum}, + {"default", Ncell} + }; + const amrex::Array searchKey{"load_balancing_cost_estimate", + "chem_load_balancing_cost_estimate"}; +}; +const LoadBalanceCost lbcost; + +/** + * \brief struct holding PeleLMeX load balancing method options + default is SFC + */ +struct LoadBalanceMethod { + enum { + SFC = 0, + Knapsack, + RoundRobin + }; + const std::map str2int = { + {"sfc", SFC}, + {"knapsack", Knapsack}, + {"roundrobin", RoundRobin}, + {"default", SFC} + }; + const amrex::Array searchKey{"load_balancing_method", + "chem_load_balancing_method"}; +}; +const LoadBalanceMethod lbmethod; +#endif diff --git a/Source/PeleLMUtils.cpp b/Source/PeleLMUtils.cpp index c7b88c68..c0bf3545 100644 --- a/Source/PeleLMUtils.cpp +++ b/Source/PeleLMUtils.cpp @@ -602,6 +602,7 @@ void PeleLM::resetCoveredMask() if (m_verbose) Print() << " Resetting fine-covered cells mask \n"; for (int lev = 0; lev < finest_level; ++lev) { + // Set a fine-covered mask BoxArray baf = grids[lev+1]; baf.coarsen(ref_ratio[lev]); m_coveredMask[lev]->setVal(1); @@ -653,21 +654,35 @@ void PeleLM::resetCoveredMask() } m_baChem[lev].reset(new BoxArray(std::move(bl))); m_dmapChem[lev].reset(new DistributionMapping(*m_baChem[lev])); + + // Load balancing of the chemistry DMap + if (m_doLoadBalance) { + loadBalanceChemLev(lev); + } } // Set a BoxArray for the chemistry on the finest level too m_baChem[finest_level].reset(new BoxArray(grids[finest_level])); if ( m_max_grid_size_chem.min() > 0 ) { - 
m_baChem[finest_level]->maxSize(m_max_grid_size_chem); + m_baChem[finest_level]->maxSize(m_max_grid_size_chem); } m_baChemFlag[finest_level].resize(m_baChem[finest_level]->size()); std::fill(m_baChemFlag[finest_level].begin(), m_baChemFlag[finest_level].end(), 1); m_dmapChem[finest_level].reset(new DistributionMapping(*m_baChem[finest_level])); + if (m_doLoadBalance && m_max_grid_size_chem.min() > 0) { + loadBalanceChemLev(finest_level); + } + // Switch off trigger m_resetCoveredMask = 0; - } + } else { + // Just load balance the chem. distribution map + if (m_doLoadBalance) { + loadBalanceChem(); + } + } //---------------------------------------------------------------------------- // Need to compute the uncovered volume @@ -680,9 +695,84 @@ void PeleLM::resetCoveredMask() } m_uncoveredVol = MFSum(GetVecOfConstPtrs(dummy),0); } +} + +void +PeleLM::loadBalanceChem() { + + for (int lev = 0; lev <= finest_level; ++lev) { + // Finest grid uses AmrCore DM unless different max grid size specified. + // Keep the AmrCore DM. + if ( lev == finest_level && m_max_grid_size_chem.min() < 0 ) { + continue; + } + loadBalanceChemLev(lev); + } +} + +void +PeleLM::loadBalanceChemLev(int a_lev) { + + LayoutData new_cost(*m_baChem[a_lev],*m_dmapChem[a_lev]); + computeCosts(a_lev, new_cost, m_loadBalanceCostChem); + + // Use efficiency: average MPI rank cost / max cost + amrex::Real currentEfficiency = 0.0; + amrex::Real testEfficiency = 0.0; + + DistributionMapping test_dmap; + // Build the test dmap, w/o braodcasting + if (m_loadBalanceMethodChem == LoadBalanceMethod::SFC) { + + test_dmap = DistributionMapping::makeSFC(new_cost, + currentEfficiency, testEfficiency, + false, + ParallelDescriptor::IOProcessorNumber()); + + } else if (m_loadBalanceMethodChem == LoadBalanceMethod::Knapsack) { + + const amrex::Real navg = static_cast(m_baChem[a_lev]->size()) / + static_cast(ParallelDescriptor::NProcs()); + const int nmax = static_cast(std::max(std::round(m_loadBalanceKSfactor*navg), std::ceil(navg))); + test_dmap = DistributionMapping::makeKnapSack(new_cost, + currentEfficiency, testEfficiency, + nmax, + false, + ParallelDescriptor::IOProcessorNumber()); + } + // IO proc determine if the test dmap offers significant improvements + int updateDmap = false; + if ((m_loadBalanceEffRatioThreshold > 0.0) + && (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())) + { + updateDmap = testEfficiency > m_loadBalanceEffRatioThreshold*currentEfficiency; + } + ParallelDescriptor::Bcast(&updateDmap, 1, ParallelDescriptor::IOProcessorNumber()); + + if ( m_verbose > 2 && updateDmap) { + Print() << " Old Chem LoadBalancing efficiency on a_lev " << a_lev << ": " << currentEfficiency << "\n" + << " New Chem LoadBalancing efficiency: " << testEfficiency << " \n"; + } + + // Bcast the test dmap if better + if (updateDmap) { + Vector pmap; + if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) { + pmap = test_dmap.ProcessorMap(); + } else { + pmap.resize(static_cast(m_baChem[a_lev]->size())); + } + ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber()); + + if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber()) { + test_dmap = DistributionMapping(pmap); + } + m_dmapChem[a_lev].reset(new DistributionMapping(test_dmap)); + } } + // Return a unique_ptr with the entire derive std::unique_ptr PeleLM::derive(const std::string &a_name, diff --git a/Submodules/AMReX-Hydro b/Submodules/AMReX-Hydro index 38ad3080..7aa8b657 160000 --- 
a/Submodules/AMReX-Hydro +++ b/Submodules/AMReX-Hydro @@ -1 +1 @@ -Subproject commit 38ad3080464e4bfaf8c3c5a17aa7df0e80d85f8d +Subproject commit 7aa8b65782d0309fb5b84926f906f306edfa4509 diff --git a/Submodules/PelePhysics b/Submodules/PelePhysics index 2108b3ef..29a49c38 160000 --- a/Submodules/PelePhysics +++ b/Submodules/PelePhysics @@ -1 +1 @@ -Subproject commit 2108b3efda4d5fbfc2e6e46e0982d70a968a0191 +Subproject commit 29a49c38a68eb39146ca8489c599e0a0d2ea49ad diff --git a/Submodules/amrex b/Submodules/amrex index bacaa10a..b0a03791 160000 --- a/Submodules/amrex +++ b/Submodules/amrex @@ -1 +1 @@ -Subproject commit bacaa10a76366ed40b90b422cf5c7da43565281a +Subproject commit b0a037918f78a5e9f32988c4b0016093929954e7