From 47503eb81950b778660c3d9deea3e7df0773ec68 Mon Sep 17 00:00:00 2001 From: Weiyuan Jiang Date: Fri, 12 Jan 2024 10:01:41 -0500 Subject: [PATCH 1/9] added openmpi flags to make it run faster on SLES15 ( it doesn't hurt on SLES12) --- src/Applications/LDAS_App/lenkf.j.template | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/Applications/LDAS_App/lenkf.j.template b/src/Applications/LDAS_App/lenkf.j.template index 11a1ddfb..4e478f8b 100644 --- a/src/Applications/LDAS_App/lenkf.j.template +++ b/src/Applications/LDAS_App/lenkf.j.template @@ -40,7 +40,23 @@ setenv argv source $GEOSBIN/g5_modules -setenv I_MPI_DAPL_UD enable +# OPENMPI flags +# Turn off warning about TMPDIR on NFS +setenv OMPI_MCA_shmem_mmap_enable_nfs_warning 0 +# pre-connect MPI procs on mpi_init +setenv OMPI_MCA_mpi_preconnect_all 1 +setenv OMPI_MCA_coll_tuned_bcast_algorithm 7 +setenv OMPI_MCA_coll_tuned_scatter_algorithm 2 +setenv OMPI_MCA_coll_tuned_reduce_scatter_algorithm 3 +setenv OMPI_MCA_coll_tuned_allreduce_algorithm 3 +setenv OMPI_MCA_coll_tuned_allgather_algorithm 4 +setenv OMPI_MCA_coll_tuned_allgatherv_algorithm 3 +setenv OMPI_MCA_coll_tuned_gather_algorithm 1 +setenv OMPI_MCA_coll_tuned_barrier_algorithm 0 +# required for a tuned flag to be effective +setenv OMPI_MCA_coll_tuned_use_dynamic_rules 1 +# disable file locks +setenv OMPI_MCA_sharedfp "^lockedfile,individual" # By default, ensure 0-diff across processor architecture by limiting MKL's freedom to pick algorithms. # As of June 2021, MKL_CBWR=AVX2 is fastest setting that works for both haswell and skylake at NCCS. From 0b981471ce0dfd01edfb76986c6772955f93eb7e Mon Sep 17 00:00:00 2001 From: Weiyuan Jiang Date: Fri, 12 Jan 2024 11:33:32 -0500 Subject: [PATCH 2/9] submit job to where it is built --- src/Applications/LDAS_App/CMakeLists.txt | 5 ++++- src/Applications/LDAS_App/ldas_setup | 15 +++++++++++++-- src/Applications/LDAS_App/lenkf.j.template | 2 ++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/Applications/LDAS_App/CMakeLists.txt b/src/Applications/LDAS_App/CMakeLists.txt index f2f5ed36..b3b5a4a1 100644 --- a/src/Applications/LDAS_App/CMakeLists.txt +++ b/src/Applications/LDAS_App/CMakeLists.txt @@ -20,7 +20,6 @@ ecbuild_add_executable ( LIBS GEOSlandassim_GridComp) set (scripts - ldas_setup process_hist.csh process_rst.py ens_forcing/average_ensemble_forcing.py @@ -35,6 +34,10 @@ install ( DESTINATION bin ) +set(file ldas_setup) +configure_file(${file} ${file} @ONLY) +install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${file} DESTINATION bin) + file(GLOB rc_files GEOSldas_*rc) file(GLOB nml_files LDASsa_DEFAULT*nml) diff --git a/src/Applications/LDAS_App/ldas_setup b/src/Applications/LDAS_App/ldas_setup index f8860681..e15b8105 100755 --- a/src/Applications/LDAS_App/ldas_setup +++ b/src/Applications/LDAS_App/ldas_setup @@ -51,6 +51,13 @@ class LDASsetup: 'MINLON','MAXLON','MINLAT','MAXLAT','EXCLUDE_FILE','INCLUDE_FILE','MWRTM_PATH','GRIDNAME', 'ADAS_EXPDIR', 'BCS_RESOLUTION' ] + # if build on sles15, BUILT_ON_SLES15 is "NOYES" + BUILT_ON_SLES15 = "NO@BUILT_ON_SLES15@" + + if BUILT_ON_SLES15== "NO": + slef.BUILT_ON_SLES15 = False + else: + self.BUILT_ON_SLES15 = True # ------ # Required resource manager input fields @@ -1365,8 +1372,12 @@ class LDASsetup: elif 'MY_ADAS_EXPDIR' in line : if self.ladas_coupling > 0: fout.write(line.replace('MY_ADAS_EXPDIR', self.rqdExeInp['ADAS_EXPDIR'])) - - + elif 'MY_CONSTRAINT' in line: + if self.BUILD_ON_SLES15 : + fout.write(line.replace('MY_CONSTRAINT', 'mil')) + else: + fout.write(line.replace('MY_CONSTRAINT', '"[cas|sky]"')) + else : fout.write(line.replace('MY_EXPDIR',self.exphome+'/$EXPID')) diff --git a/src/Applications/LDAS_App/lenkf.j.template b/src/Applications/LDAS_App/lenkf.j.template index 4e478f8b..aed32135 100644 --- a/src/Applications/LDAS_App/lenkf.j.template +++ b/src/Applications/LDAS_App/lenkf.j.template @@ -15,6 +15,8 @@ #SBATCH --nodes=MY_NODES --ntasks-per-node=MY_NTASKS_PER_NODE #SBATCH --job-name=MY_JOB #SBATCH --qos=MY_QOS +#SBATCH --constraint=MY_CONSTRAINT + ####################################################################### # System Settings and Architecture Specific Environment Variables From 78f923b1f5f9b42a497edf71bedea4107bdf1b5b Mon Sep 17 00:00:00 2001 From: Weiyuan Jiang Date: Fri, 12 Jan 2024 14:17:35 -0500 Subject: [PATCH 3/9] typo fix --- src/Applications/LDAS_App/ldas_setup | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Applications/LDAS_App/ldas_setup b/src/Applications/LDAS_App/ldas_setup index e15b8105..f6f3b9a9 100755 --- a/src/Applications/LDAS_App/ldas_setup +++ b/src/Applications/LDAS_App/ldas_setup @@ -51,13 +51,13 @@ class LDASsetup: 'MINLON','MAXLON','MINLAT','MAXLAT','EXCLUDE_FILE','INCLUDE_FILE','MWRTM_PATH','GRIDNAME', 'ADAS_EXPDIR', 'BCS_RESOLUTION' ] - # if build on sles15, BUILT_ON_SLES15 is "NOYES" - BUILT_ON_SLES15 = "NO@BUILT_ON_SLES15@" + # if build on sles15, BUILT_ON_SLES15 is "TRUE", else emply "" + BUILT_ON_SLES15 = "@BUILT_ON_SLES15@" - if BUILT_ON_SLES15== "NO": - slef.BUILT_ON_SLES15 = False + if BUILT_ON_SLES15 == "TRUE": + slef.BUILT_ON_SLES15 = True else: - self.BUILT_ON_SLES15 = True + self.BUILT_ON_SLES15 = False # ------ # Required resource manager input fields @@ -1373,7 +1373,7 @@ class LDASsetup: if self.ladas_coupling > 0: fout.write(line.replace('MY_ADAS_EXPDIR', self.rqdExeInp['ADAS_EXPDIR'])) elif 'MY_CONSTRAINT' in line: - if self.BUILD_ON_SLES15 : + if self.BUILT_ON_SLES15 : fout.write(line.replace('MY_CONSTRAINT', 'mil')) else: fout.write(line.replace('MY_CONSTRAINT', '"[cas|sky]"')) From fcbcb2e90b00dd711c78f95ba091a91202606f9e Mon Sep 17 00:00:00 2001 From: Weiyuan Jiang Date: Fri, 12 Jan 2024 14:44:20 -0500 Subject: [PATCH 4/9] more typo fix --- src/Applications/LDAS_App/ldas_setup | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Applications/LDAS_App/ldas_setup b/src/Applications/LDAS_App/ldas_setup index f6f3b9a9..c39bbffb 100755 --- a/src/Applications/LDAS_App/ldas_setup +++ b/src/Applications/LDAS_App/ldas_setup @@ -55,7 +55,7 @@ class LDASsetup: BUILT_ON_SLES15 = "@BUILT_ON_SLES15@" if BUILT_ON_SLES15 == "TRUE": - slef.BUILT_ON_SLES15 = True + self.BUILT_ON_SLES15 = True else: self.BUILT_ON_SLES15 = False @@ -700,7 +700,11 @@ class LDASsetup: print ('\nCorrect the tile file if it is an old EASE tile format... \n') EASEtile=self.bcsdir+'/MAPL_'+short_tile cmd = './preprocess_ldas.x correctease '+ tile + ' '+ EASEtile - print ("cmd: " + cmd) + if self.BUILT_ON_SLES15 : + print ("Executables are built on SLES15 and should be run on SLES15: " + cmd) + else: + print ("cmd: " + cmd) + sp.call(shlex.split(cmd)) if os.path.isfile(EASEtile) : From 8c28f2a8c84fffa97421d6f8112a75c28b0e1453 Mon Sep 17 00:00:00 2001 From: Weiyuan Jiang Date: Wed, 17 Jan 2024 14:12:17 -0500 Subject: [PATCH 5/9] add constraint based on BUILT_ON_SLES15 --- src/Applications/LDAS_App/ldas_setup | 15 +++++++-------- src/Applications/LDAS_App/lenkf.j.template | 2 -- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/Applications/LDAS_App/ldas_setup b/src/Applications/LDAS_App/ldas_setup index c39bbffb..e4262a9d 100755 --- a/src/Applications/LDAS_App/ldas_setup +++ b/src/Applications/LDAS_App/ldas_setup @@ -1344,8 +1344,13 @@ class LDASsetup: elif 'MY_NODES' in line : line_ = line.replace('MY_NODES',str(self.optRmInp['nodes'])) fout.write(line_.replace('MY_NTASKS_PER_NODE',str(self.rqdRmInp['ntasks-per-node']))) - if int(self.rqdRmInp['ntasks-per-node']) > 40: - fout.write("#SBATCH --constraint=cas\n") + + if self.BUILT_ON_SLES15 : + fout.write("#SBATCH --constraint=mil\n") + else: + assert int(self.rqdRmInp['ntasks-per-node']) <= 46, 'ntasks-per-node should be smaller than 46 for cas' + fout.write("#SBATCH --constraint=cas\n") + elif 'MY_OSERVER_NODES' in line : fout.write(line.replace('MY_OSERVER_NODES',str(self.optRmInp['oserver_nodes']))) elif 'MY_WRITERS_NPES' in line : @@ -1376,12 +1381,6 @@ class LDASsetup: elif 'MY_ADAS_EXPDIR' in line : if self.ladas_coupling > 0: fout.write(line.replace('MY_ADAS_EXPDIR', self.rqdExeInp['ADAS_EXPDIR'])) - elif 'MY_CONSTRAINT' in line: - if self.BUILT_ON_SLES15 : - fout.write(line.replace('MY_CONSTRAINT', 'mil')) - else: - fout.write(line.replace('MY_CONSTRAINT', '"[cas|sky]"')) - else : fout.write(line.replace('MY_EXPDIR',self.exphome+'/$EXPID')) diff --git a/src/Applications/LDAS_App/lenkf.j.template b/src/Applications/LDAS_App/lenkf.j.template index aed32135..4e478f8b 100644 --- a/src/Applications/LDAS_App/lenkf.j.template +++ b/src/Applications/LDAS_App/lenkf.j.template @@ -15,8 +15,6 @@ #SBATCH --nodes=MY_NODES --ntasks-per-node=MY_NTASKS_PER_NODE #SBATCH --job-name=MY_JOB #SBATCH --qos=MY_QOS -#SBATCH --constraint=MY_CONSTRAINT - ####################################################################### # System Settings and Architecture Specific Environment Variables From c1e0cc50dcc7c0d62452e166a379c938baea72be Mon Sep 17 00:00:00 2001 From: Rolf Reichle <54944691+gmao-rreichle@users.noreply.github.com> Date: Wed, 17 Jan 2024 14:39:06 -0500 Subject: [PATCH 6/9] minor edits of SLES15 comments (ldas_setup) --- src/Applications/LDAS_App/ldas_setup | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Applications/LDAS_App/ldas_setup b/src/Applications/LDAS_App/ldas_setup index e4262a9d..12a72d27 100755 --- a/src/Applications/LDAS_App/ldas_setup +++ b/src/Applications/LDAS_App/ldas_setup @@ -51,7 +51,7 @@ class LDASsetup: 'MINLON','MAXLON','MINLAT','MAXLAT','EXCLUDE_FILE','INCLUDE_FILE','MWRTM_PATH','GRIDNAME', 'ADAS_EXPDIR', 'BCS_RESOLUTION' ] - # if build on sles15, BUILT_ON_SLES15 is "TRUE", else emply "" + # if built on sles15, BUILT_ON_SLES15 is "TRUE", else empty "" BUILT_ON_SLES15 = "@BUILT_ON_SLES15@" if BUILT_ON_SLES15 == "TRUE": @@ -701,7 +701,7 @@ class LDASsetup: EASEtile=self.bcsdir+'/MAPL_'+short_tile cmd = './preprocess_ldas.x correctease '+ tile + ' '+ EASEtile if self.BUILT_ON_SLES15 : - print ("Executables are built on SLES15 and should be run on SLES15: " + cmd) + print ("Executables were built on SLES15 and must be run on SLES15: " + cmd) else: print ("cmd: " + cmd) @@ -1348,7 +1348,7 @@ class LDASsetup: if self.BUILT_ON_SLES15 : fout.write("#SBATCH --constraint=mil\n") else: - assert int(self.rqdRmInp['ntasks-per-node']) <= 46, 'ntasks-per-node should be smaller than 46 for cas' + assert int(self.rqdRmInp['ntasks-per-node']) <= 46, 'ntasks-per-node should be <=46 for cas' fout.write("#SBATCH --constraint=cas\n") elif 'MY_OSERVER_NODES' in line : From 87079d78aa151cf916d4141b8451cd7f048fff85 Mon Sep 17 00:00:00 2001 From: Rolf Reichle Date: Sat, 20 Jan 2024 13:06:05 -0500 Subject: [PATCH 7/9] updated README.md for use of Milan nodes (SLES15) at NCCS --- README.md | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 5b81ce44..84ca1cf8 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,20 @@ module use -a (path) module load GEOSenv ``` -where `(path)` depends on the computer and operating system: +where `(path)` depends on the computing system; at NCCS, `(path)` also depends on the operating system (SLES12 on Skylake and Cascade Lake nodes; SLES15 on Milan nodes, as of Jan. 2024): | System | Path | | ------------- |---------------------------------------------------| -| NCCS | `/discover/swdev/gmao_SIteam/modulefiles-SLES12` | +| NCCS Discover | `/discover/swdev/gmao_SIteam/modulefiles-SLES12` | +| | `/discover/swdev/gmao_SIteam/modulefiles-SLES15` | | NAS | `/nobackup/gmao_SIteam/modulefiles` | | GMAO desktops | `/ford1/share/gmao_SIteam/modulefiles` | +Step 1 can be coded into the user's shell configuration file (e.g., `.bashrc` or `.cshrc`). See the [GEOSgcm Wiki](https://github.com/GEOS-ESM/GEOSgcm/wiki/) for sample shell configuration files. ### Step 2: Obtain the Model -For development work, clone the _entire_ repository and use the `develop` branch as your starting point (equivalent to the `UNSTABLE` tag in the old CVS repository): +For development work, clone the _entire_ repository and use the `develop` branch as your starting point: ``` git clone -b develop git@github.com:GEOS-ESM/GEOSldas.git ``` @@ -36,25 +38,32 @@ git clone -b v17.9.1 --single-branch git@github.com:GEOS-ESM/GEOSldas.git ### Step 3: Build the Model -To build the model in a single step, do the following: +To build the model in a single step, do the following from a head node: ``` cd ./GEOSldas parallel_build.csh ``` -from a head node. Doing so will check out all the external repositories of the model (albeit only on the first run, [see subsection on mepo below](#mepo)!) and build the model. When done, the resulting model build will be found in `build-SLES12/` and the installation will be found in `install-SLES12/`, with setup scripts like `ldas_setup` in `install-SLES12/bin`. +This checks out all the external repositories of the model (albeit only on the first run, [see subsection on mepo below](#mepo)!) and then builds and installs the model. -To obtain a build that is suitable for debugging, use `parallel_build.csh -debug`, which will build in `build-Debug-SLES12/` and install in `install-Debug-SLES12/`. There is also an option for aggressive optimization. For details, see [GEOSldas Wiki](https://github.com/GEOS-ESM/GEOSldas/wiki). +At **NCCS**, the default is to build GEOSldas on SLES12 (Skylake or Cascade Lake nodes); to build GEOSldas on SLES15 (Milan nodes), use `parallel_build.csh -mil`. -See below for how to build the model in multiple steps. +The resulting model build is found in `build[-SLESxx]/`, and the installation is found in `install[-SLESxx]/`, with setup scripts like `ldas_setup` in `install[-SLESxx]/bin`. + +To obtain a build that is suitable for debugging, use `parallel_build.csh -debug`, which builds in `build-Debug[-SLESxx]/` and installs in `install-Debug[-SLESxx]/`. There is also an option for aggressive optimization. For details, see the [GEOSldas Wiki](https://github.com/GEOS-ESM/GEOSldas/wiki). + +Instructions for building the model in multiple steps are provided below. --- ## How to Set Up (Configure) and Run GEOSldas -a) Set up the job as follows: + +a) At **NCCS**, GEOSldas must be built, configured, and run on the same operating system. To run GEOSldas on Milan nodes (SLES15), start with `ssh discover-mil`. + +b) Set up the job as follows: ``` -cd (build_path)/GEOSldas/install/bin +cd (build_path)/GEOSldas/install[-SLESxx]/bin source g5_modules [for bash or zsh: source g5_modules.[z]sh] ./ldas_setup setup [-v] (exp_path) ("exe"_input_filename) ("bat"_input_filename) ``` @@ -82,7 +91,7 @@ Edit these sample files following the examples and comments within the sample fi The ldas_setup script creates a run directory and other directories at: `[exp_path]/[exp_name]` -Configuration input files will be created at: +Configuration input files are created at: `[exp_path]/[exp_name]/run` For more options and documentation, use any of the following: @@ -92,16 +101,19 @@ ldas_setup sample -h ldas_setup setup -h ``` -b) Configure the experiment output by editing the ```./run/HISTORY.rc``` file as needed. +c) Configure the experiment output by editing the ```./run/HISTORY.rc``` file as needed. -c) Run the job: +d) Run the job: ``` cd [exp_path]/[exp_name]/run/ sbatch lenkf.j ``` -For more information, see the files in `./doc/`. -Moreover, descriptions of the configuration (resource) parameters are included in the sample "exeinp" and "batinp" files that can be generated using `ldas_setup`. +At **NCCS**, the appropriate SLURM directive `#SBATCH --constraint=[xxx]` is automatically added into `lenkf.j` depending on the operating system. + +For more information, see the files in `./doc/`. Moreover, descriptions of the configuration (resource) parameters are included in the sample "exeinp" and "batinp" files that can be generated using `ldas_setup`. + + ----------------------------------------------------------------------------------- @@ -138,7 +150,6 @@ We currently do not allow in-source builds of GEOSldas. So we must make a direct ``` mkdir build ``` -The advantages of this is that you can build both a Debug and Release version with the same clone if desired. #### Run CMake CMake generates the Makefiles needed to build the model. @@ -146,7 +157,7 @@ CMake generates the Makefiles needed to build the model. cd build cmake .. -DBASEDIR=$BASEDIR/Linux -DCMAKE_Fortran_COMPILER=ifort -DCMAKE_INSTALL_PREFIX=../install ``` -This will install to a directory parallel to your `build` directory. If you prefer to install elsewhere change the path in: +This installs into a directory parallel to your `build` directory. If you prefer to install elsewhere change the path in: ``` -DCMAKE_INSTALL_PREFIX= ``` From e883a23efb4200374d570e2916c9fb5a0b466843 Mon Sep 17 00:00:00 2001 From: Biljana Orescanin <68251545+biljanaorescanin@users.noreply.github.com> Date: Wed, 31 Jan 2024 13:18:22 -0500 Subject: [PATCH 8/9] generic nco --- src/Applications/LDAS_App/lenkf.j.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Applications/LDAS_App/lenkf.j.template b/src/Applications/LDAS_App/lenkf.j.template index 4e478f8b..5113b5e4 100644 --- a/src/Applications/LDAS_App/lenkf.j.template +++ b/src/Applications/LDAS_App/lenkf.j.template @@ -70,7 +70,7 @@ setenv MKL_CBWR "AVX2" setenv LD_LIBRARY_PATH ${BASEDIR}/${ARCH}/lib:${ESMADIR}/lib:${LD_LIBRARY_PATH} if ( -e /etc/os-release ) then - module load nco/4.8.1 + module load nco else module load other/nco-4.6.8-gcc-5.3-sp3 endif From d82e82c26ab32704617369e51a20b06421f32851 Mon Sep 17 00:00:00 2001 From: Biljana Orescanin <68251545+biljanaorescanin@users.noreply.github.com> Date: Wed, 31 Jan 2024 13:26:24 -0500 Subject: [PATCH 9/9] remove obsolete if --- src/Applications/LDAS_App/lenkf.j.template | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/Applications/LDAS_App/lenkf.j.template b/src/Applications/LDAS_App/lenkf.j.template index 5113b5e4..6f77c5c2 100644 --- a/src/Applications/LDAS_App/lenkf.j.template +++ b/src/Applications/LDAS_App/lenkf.j.template @@ -69,11 +69,8 @@ setenv MKL_CBWR "AVX2" # reversed sequence for LADAS_COUPLING (Sep 2020) (needed when coupling with ADAS using different BASEDIR) setenv LD_LIBRARY_PATH ${BASEDIR}/${ARCH}/lib:${ESMADIR}/lib:${LD_LIBRARY_PATH} -if ( -e /etc/os-release ) then - module load nco -else - module load other/nco-4.6.8-gcc-5.3-sp3 -endif +module load nco + setenv RUN_CMD "$GEOSBIN/esma_mpirun -np " #######################################################################