In [1]:
import sys
sys.path.append("/lus/cls01029/spartee/poseidon/smart-sim/")

# Improving MOM6 Simulations with Machine Learning using SmartSim
 - In this notebook we demonstrate how to run Modular Ocean Model 6 (MOM6) with our the machine learned EKE parameterization
 - We show how to use SmartSim to launch the machine learning infrastructure so that the Fortran-based MOM6 and a Pytorch Model can communicate during the simulation.
 - The Pytorch Model is trained on 1/10 degree MOM6 output data and queried by a 1/4 degree MOM6 simulation to provide the value of Eddy Kinetic Energy in each subdomain of the model every timestep (6 hours).

In [2]:
# import SmartSim and the Slurm interface for obtaining allocations.
from smartsim import Experiment, slurm
from smartsim.utils.log import log_to_file

In [3]:
# import some libraries for online data analysis
import os
import time
import numpy as np
import xarray as xr
from os import environ
import matplotlib.pyplot as plt

## 1.1) Create the SmartSim Experiment
 - The SmartSim library provides the Experiment object as the main API for the user.
 - The Experiment API is used for starting, monitoring, tracking, and stopping jobs created by the user.
 - Each experiment has a specific "launcher" which communicates with the WLM of the host machine to launch jobs.
    - Launcher options
       - PBS
       - Slurm
       - Local
       - Kubernetes (soon)
       - Capsules (soon)

In [4]:
# Create the SmartSim Experiment. 
experiment = Experiment("AI-EKE-MOM6", launcher="slurm")

## 1.2) Obtain Computational Resource Allocations 
   - When launching on Slurm, users can obtain an allocation and tell the SmartSim Experiment where to launch their applications
   - Here we obtain two allocations. 
      1. An allocation for MOM6 on 64 Skylake compute nodes with 56 tasks running per node
      2. An allocation for the orchestrator database on 15 Broadwell CPUs with 1 P100 GPU per node.

In [None]:

# resource allocation settings
database_nodes = 16
database_alloc_opts = {
    "constraint": "P100",
    "cpus-per-task": 36,
    "ntasks-per-node": 1,
    "exclusive": None,
    "time": "10-00:00:00"
}

mom6_compute_nodes = 74
mom6_total_tasks = 3551
mom6_alloc_opts = {
    "constraint": "SK48",
    "ntasks-per-node": 48,
    "exclusive": None,
    "time": "10-00:00:00"
}

# obtain allocations for the model and database through SmartSim
mom6_alloc = slurm.get_slurm_allocation(nodes=mom6_compute_nodes,
                                        add_opts=mom6_alloc_opts)
db_alloc = slurm.get_slurm_allocation(nodes=database_nodes,
                                      add_opts=database_alloc_opts)

In [5]:
# ensemble runs
# resource allocation settings
database_nodes = 16
database_alloc_opts = {
    "constraint": "P100",
    "cpus-per-task": 36,
    "ntasks-per-node": 1,
    "exclusive": None,
    "time": "10-00:00:00"
}

ens_compute_nodes = 300
ens_total_tasks = 14400
ens_alloc_opts = {
    "constraint": '"[SK48,SK56,CL48]"',
    "ntasks-per-node": 48,
    "exclusive": None,
    "time": "10-00:00:00"
}

# obtain allocations for the model and database through SmartSim
ens_alloc = slurm.get_slurm_allocation(nodes=ens_compute_nodes,
                                        add_opts=ens_alloc_opts)
db_alloc = slurm.get_slurm_allocation(nodes=database_nodes,
                                      add_opts=database_alloc_opts)

17:35:12 horizon smartsim.launcher.slurm.slurm[1564] DEBUG Allocation settings: -N 300 -J SmartSim --constraint=[SK48,SK56,CL48] --ntasks-per-node=48 --exclusive --time=10-00:00:00
17:35:12 horizon smartsim.launcher.slurm.slurm[1564] DEBUG salloc: error: Job submit/allocate failed: Requested node configuration is not available



LauncherError: Job submit/allocate failed: Requested node configuration is not available

## 1.3) Setup MOM6 and Inference Database in SmartSim

In [None]:
# create the database object
orc = experiment.create_orchestrator(db_nodes=database_nodes,
                                     overwrite=True,
                                     alloc=db_alloc)

# Create the MOM6 model object
mom6_settings = {
    "nodes": mom6_compute_nodes,
    "ntasks": mom6_total_tasks,
    "executable":  "/lus/cls01029/ashao/dev/MOM6-examples/build/gnu/ice_ocean_SIS2/repro/MOM6",
    "alloc": mom6_alloc,
    "exclusive": None
}
mom6 = experiment.create_model("mom6_model", run_settings=mom6_settings)


In [None]:
num_ensembles = 6
ensemble = experiment.create_ensemble("MOM6-ensemble", run_settings=run_settings)
for i in range(num_ensembles):
    _model = exp.create_model(f"MOM6_{str(i)}", run_settings=run_settings)
    _model.register_incoming_connection(_model, "fortran")
    ensemble.add_model(_model)

In [None]:
# setup the input/configuration files for the MOM6 simulation
mom6.attach_generator_files(
    to_copy=["/lus/cls01029/spartee/MOM/OM4_025"],
    to_symlink=['/lus/cls01029/ashao/dev/poseidon-smartsim/examples/MOM6/ml_eke/INPUT']
)
experiment.generate(mom6, orc, overwrite=True)

In [None]:
Affinity/Multi-core options: (when the task/affinity plugin is enabled)
  -B, --extra-node-info=S[:C[:T]]           Expands to:
      --sockets-per-node=S    number of sockets per node to allocate
      --cores-per-socket=C    number of cores per socket to allocate
      --threads-per-core=T    number of threads per core to allocate
                              each field can be 'min' or wildcard '*'
                              total cpus requested = (N x S x C x T)

      --ntasks-per-core=n     number of tasks to invoke on each core
      --ntasks-per-socket=n   number of tasks to invoke on each socket
      --cpu-bind=             Bind tasks to CPUs
                              (see "--cpu-bind=help" for options)
      --hint=                 Bind tasks according to application hints
                              (see "--hint=help" for options)
      --mem-bind=             Bind memory to locality domains (ldom)
                              (see "--mem-bind=help" for options)

Options provided by plugins:
      --image=image           shifter image to use
      --volume=volume         shifter image bindings

GPU scheduling options:
      --cpus-per-gpu=n        number of CPUs required per allocated GPU
  -G, --gpus=n                count of GPUs required for the job
      --gpu-bind=...          task to gpu binding options
      --gpu-freq=...          frequency and voltage of GPUs
      --gpus-per-node=n       number of GPUs required per allocated node
      --gpus-per-socket=n     number of GPUs required per allocated socket
      --gpus-per-task=n       number of GPUs required per spawned task
      --mem-per-gpu=n         real memory required per allocated GPU

        CPU bind options:
    --cpu-bind=         Bind tasks to CPUs
        q[uiet]         quietly bind before task runs (default)
        v[erbose]       verbosely report binding before task runs
        no[ne]          don't bind tasks to CPUs (default)
        rank            bind by task rank
        map_cpu:<list>  specify a CPU ID binding for each task
                        where <list> is <cpuid1>,<cpuid2>,...<cpuidN>
        mask_cpu:<list> specify a CPU ID binding mask for each task
                        where <list> is <mask1>,<mask2>,...<maskN>
        rank_ldom       bind task by rank to CPUs in a NUMA locality domain
        map_ldom:<list> specify a NUMA locality domain ID for each task
                        where <list> is <ldom1>,<ldom2>,...<ldomN>
        mask_ldom:<list>specify a NUMA locality domain ID mask for each task
                        where <list> is <mask1>,<mask2>,...<maskN>
        sockets         auto-generated masks bind to sockets
        cores           auto-generated masks bind to cores
        threads         auto-generated masks bind to threads
        ldoms           auto-generated masks bind to NUMA locality domains
        boards          auto-generated masks bind to boards
        help            show this help message

## 1.4) Start the Experiment
  - To start the SmartSim experiment, we simply provide our MOM6 model and orchestrator objects to the start function.
  - If Block is set to true, the start method is blocking and further code will not be executed.
  - If Block is false, users will be able to interact with their models in real time.

In [None]:
# Start the model and orchestrator
experiment.start(mom6, block=False, summary=True)

In [None]:
print(f"MOM6 Status: {experiment.get_status(mom6)[0]}")

In [None]:
experiment.stop(mom6)

# 2.0) Interactive, Online Analysis of MOM6 with SmartSim/SILC
  - SmartSim allows simulation users to interact with and view their data being produced in real-time using SILC.
  - Fortran, C, and C++ arrays are converted directly into Python NumPy arrays. 
  - SILC handles all array trasformations, memory management, and row/column major transformations between langauges.

In [None]:
!salloc --help | grep per-task

In [None]:
# import SILC python client to retrieve FORTRAN data.
from silc import Client
import reconstruct

## 2.1) Initialize the SILC Python Client

In [None]:
client = Client("nid00196:6379", True, True)

In [None]:
script = client.get_script("preeke")

In [None]:
script

## 2.2) Retrieve and Visualize MKE and EKE during the Simulation
   - Here we **show the plots of MKE and EKE while MOM6 is running**
   - The data is sent from MOM6, stored in memory, and retrieved by the SILC Python client. 
   - We use a simple Python script to reconstruct the domain (reconstruct.py)
   - This approach **completely bypasses the file-system** except for the ocean grid which doesn't change.

In [None]:
grid = xr.open_dataset('/home/users/ashao/lustre/dev/poseidon-smartsim/examples/MOM6/ml_eke/year1/mom6_model/sea_ice_geometry.nc')

plt.figure(figsize=(20, 8))
plt.subplot(1,2,1)
mke = reconstruct.reconstruct_domain(client, "MKE_59918400900.",3551).transpose()
mke[grid.wet==0] = np.nan
plt.pcolormesh(mke,vmin=0,vmax=.2)
plt.colorbar(label='Mean Kinetic Energy (m$^2$s$^{-2}$)')
plt.title("Mean Kinetic Energy streamed out of MOM6")

plt.subplot(1,2,2)
mke = reconstruct.reconstruct_domain(client,"EKE_59918400900.",3551).transpose()
mke[grid.wet==0] = np.nan
plt.pcolormesh(mke,vmin=0,vmax=.1)
plt.colorbar(label='Eddy Kinetic Energy (m$^2$s$^{-2}$)')
plt.title("Eddy Kinetic Energy streamed out of MOM6")

In [None]:
plt.figure(figsize=(20, 8))
plt.subplot(1,2,1)
mke = reconstruct.reconstruct_domain(client,"MKE_59918479200.",3551).transpose()
mke[grid.wet==0] = np.nan
plt.pcolormesh(mke,vmin=0,vmax=.2)
plt.colorbar(label='Mean Kinetic Energy (m$^2$s$^{-2}$)')
plt.title("Mean Kinetic Energy streamed out of MOM6")

plt.subplot(1,2,2)
mke = reconstruct.reconstruct_domain(client,"EKE_59918479200.",3551).transpose()
mke[grid.wet==0] = np.nan
plt.pcolormesh(mke,vmin=0,vmax=.1)
plt.colorbar(label='Eddy Kinetic Energy (m$^2$s$^{-2}$)')
plt.title("Eddy Kinetic Energy streamed out of MOM6")

In [None]:
plt.figure(figsize=(20, 8))
plt.subplot(1,2,1)
mke = reconstruct.reconstruct_domain(client,"MKE_59918408100.",3551).transpose()
mke[grid.wet==0] = np.nan
plt.pcolormesh(mke,vmin=0,vmax=.2)
plt.colorbar(label='Mean Kinetic Energy (m$^2$s$^{-2}$)')
plt.title("Mean Kinetic Energy streamed out of MOM6")

plt.subplot(1,2,2)
mke = reconstruct.reconstruct_domain(client,"EKE_59918408100.",3551).transpose()
mke[grid.wet==0] = np.nan
plt.pcolormesh(mke,vmin=0,vmax=.1)
plt.colorbar(label='Eddy Kinetic Energy (m$^2$s$^{-2}$)')
plt.title("Eddy Kinetic Energy streamed out of MOM6")

## 2.3) Release Computational Resources for the SmartSim Experiment

In [None]:
# stop the database
#experiment.stop(orc)

# Release our system compute allocation for MOM6 and the database
slurm.release_slurm_allocation(db_alloc)
slurm.release_slurm_allocation(mom6_alloc)

In [None]:
experiment.summary()

In [None]:
experiment.stop(orc)

In [None]:
!squeue

In [None]:
!sinfo

In [None]:
!sacct -u spartee