In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# ESM1.6 Scaling

End-to-end pipeline for generating scaling plots for the ESM1.6 pre-industrial configuration.

## Setup
The following variables control where the configurations and model run outputs are saved.

In [2]:
import os
import getpass

username = getpass.getuser()
defaultproject = os.environ.get("PROJECT")  # None when $PROJECT is not set

# Name of the experiment
model_type = "access-esm1.6"

# Directory that receives the cloned configurations and model run information.
# As written it is resolved relative to the notebook's working directory.
# CHANGE test_path to where you're comfortable dumping model run information, e.g.
#   test_path = f"/scratch/{defaultproject}/{username}/access_models_scaling_results"
test_path = f"{model_type}_scaling_layouts"

# Name of the directory (inside test_path) the configuration repository is cloned into
repository_directory = f"{model_type}-PI-config"


## Generate experiments

We will perform a simple scaling study, varying the total number of nodes used over [0.5, 1.0, 2.0, 4.0], i.e., we will run the model with 52, 104, 208 and 416 cores. Note that `get_node_configs` below is currently set to generate only the 4-node configuration.

In [3]:
def get_node_configs():
    from access.config.esm1p6_layout_input import LayoutSearchConfig

    # num_nodes_list = [0.5, 1.0, 2.0, 3.0, 4.0, 4.0]
    # tol_around_ctrl_ratio = [0.05]*len(num_nodes_list)
    # tol_around_ctrl_ratio[-1] = None # For the last 4-node config, allow to solve for other layouts using the defaults
    # max_wasted_ncores_frac=[0.1 if x <= 1 else 0.05 if x <=3 else 0.02 for x in num_nodes_list]
    # scaling_configs = [LayoutSearchConfig(tol_around_ctrl_ratio=t, max_wasted_ncores_frac=m) for t, m in zip(tol_around_ctrl_ratio, max_wasted_ncores_frac)]

    num_nodes_list = [4.0]
    tol_around_ctrl_ratio = [0.05]*len(num_nodes_list)
    tol_around_ctrl_ratio[-1] = None # For the last 4-node config, allow to solve for other layouts using the defaults
    max_wasted_ncores_frac=[0.1 if x <= 1 else 0.05 if x <=3 else 0.02 for x in num_nodes_list]
    scaling_configs = [LayoutSearchConfig(tol_around_ctrl_ratio=t, max_wasted_ncores_frac=m) for t, m in zip(tol_around_ctrl_ratio, max_wasted_ncores_frac)]


    return num_nodes_list, scaling_configs


In [None]:

def generate_esm1p6_layouts() -> str:
    """Build the experiment-generator configuration (YAML text) for the layout study.

    Reads the notebook-level variables ``model_type``, ``test_path``,
    ``repository_directory``, ``defaultproject`` and ``username`` to fill in the
    control-experiment section, then appends one perturbation block per node
    count returned by ``get_node_configs()``.

    Returns:
        The complete generator configuration as a YAML string, with the
        control experiment's walltime set to the accumulated estimate over all
        generated blocks, rounded up to whole hours.
    """
    import math
    import sys

    from access.config.esm1p6_layout_input import generate_esm1p6_core_layouts_from_node_count, generate_esm1p6_perturb_block

    control_experiment_name = "Control_Experiment"
    generator_config_prefix = f"""
model_type: {model_type}
repository_url: git@github.com:ACCESS-NRI/{model_type}-configs.git
start_point: 1ebd393 # the commit hash that access-bot refers to when committing the new checksums

test_path: {test_path}
repository_directory: {repository_directory}

control_branch_name: ctrl

{control_experiment_name}:
    config.yaml:
        walltime: Inf
        modules:
            use:
                - /g/data/{defaultproject}/{username}/spack/0.22/release/modules/linux-rocky8-x86_64_v4
            load:
                - access-esm1p6/2025.09.002

        

        manifest:
            reproduce:
                exe: False    # cice5 has to be manually compiled because the runtime core counts is set at compile time
        repeat: True
        runspersub: 10

Perturbation_Experiment:
"""

    blocknum = 1
    queue = "normalsr"
    branch_name_prefix = "esm1p6-layout"
    entire_block = generator_config_prefix
    num_nodes_list, scaling_configs = get_node_configs()

    # Walltime estimate: 1.5 hrs per block for a 4-node run, scaled linearly with node count.
    ref_walltime_hrs = 1.5
    ref_num_nodes = 4.0

    seen_layouts = set()
    walltime_hrs = 0.0
    for num_nodes, config in zip(num_nodes_list, scaling_configs, strict=True):
        # Only the first element of the returned value is used as the list of layouts.
        layout = generate_esm1p6_core_layouts_from_node_count(
            num_nodes,
            queue=queue,
            layout_search_config=config,
        )[0]
        if not layout:
            print(f"No layouts found for {num_nodes} nodes", file=sys.stderr)
            continue

        # Drop layouts already emitted for an earlier node count / search config.
        layout = [x for x in layout if x not in seen_layouts]
        if not layout:
            print(f"All layouts for {num_nodes} nodes were already generated; skipping", file=sys.stderr)
            continue
        seen_layouts.update(layout)
        print(f"Generated {len(layout)} layouts for {num_nodes} nodes. Layouts: {layout}")

        branch_name = f"{branch_name_prefix}-unused-cores-to-cice-{config.allocate_unused_cores_to_ice}"
        prev_blocknum = blocknum
        block, blocknum = generate_esm1p6_perturb_block(
            num_nodes, layout, branch_name, queue=queue, start_blocknum=blocknum,
        )
        nblocks_added = blocknum - prev_blocknum
        walltime_hrs += nblocks_added * (ref_walltime_hrs * ref_num_nodes / num_nodes)
        entire_block += block

    # Round up to whole hours so the request is never shorter than the estimate;
    # round() first so float accumulation noise does not add a spurious hour.
    walltime_whole_hrs = math.ceil(round(walltime_hrs, 6))
    entire_block = entire_block.replace("walltime: Inf", f"walltime: {walltime_whole_hrs}:00:00")

    return entire_block

In [5]:
from experiment_generator.experiment_generator import ExperimentGenerator
from ruamel.yaml import YAML

# YAML loader used to parse the generated configuration text, with quote preservation enabled.
ryaml = YAML()
ryaml.preserve_quotes = True

# Build the generator configuration as YAML text, then parse it.
# Both `generator_config` and `config_dict` are reused by later cells.
generator_config = generate_esm1p6_layouts()
config_dict = ryaml.load(generator_config)
# Run the experiment generator. Per the recorded output, this creates the test directory,
# clones the configuration repository and checks out the control branch.
expgen = ExperimentGenerator(config_dict)
expgen.run()

Generated 4 layouts for 4.0 nodes. Layouts: [LayoutTuple(atm_nx=16, atm_ny=13, mom_nx=14, mom_ny=14, ice_ncores=12), LayoutTuple(atm_nx=16, atm_ny=14, mom_nx=15, mom_ny=12, ice_ncores=12), LayoutTuple(atm_nx=16, atm_ny=13, mom_nx=15, mom_ny=13, ice_ncores=12), LayoutTuple(atm_nx=16, atm_ny=13, mom_nx=16, mom_ny=12, ice_ncores=12)]
-- Test directory access-esm1.6_scaling_layouts has been created!
Cloned repository from git@github.com:ACCESS-NRI/access-esm1.6-configs.git to directory: /home/593/ms2335/codes/scaling_studies/access-models-scaling/access-esm1.6_scaling_layouts/access-esm1.6-PI-config
Created and checked out new branch: ctrl
laboratory path:  /scratch/tm70/ms2335/access-esm
binary path:  /scratch/tm70/ms2335/access-esm/bin
input path:  /scratch/tm70/ms2335/access-esm/input
work path:  /scratch/tm70/ms2335/access-esm/work
archive path:  /scratch/tm70/ms2335/access-esm/archive
Updated metadata. Experiment UUID: 22561c0f-72b4-4024-b385-a772e2c8301c
Added archive symlink to /scr

## Running the model

Next we save the generator config to a YAML file, then create a runner config and run the experiments with `experiment-runner`.

In [6]:
# Keep a copy of the generator configuration on disk next to the notebook.
config_yaml_name = "esm1p6_layout_input_config.yaml"
with open(config_yaml_name, mode="w") as config_file:
    config_file.write(generator_config)
print("Generator config written to " + config_yaml_name)

Generator config written to esm1p6_layout_input_config.yaml


In [7]:
from experiment_runner.experiment_runner import ExperimentRunner

# Gather the branches listed under every perturbation block of the generator config.
pe_expts = config_dict["Perturbation_Experiment"]
all_branches = [
    branch
    for block_name in pe_expts
    for branch in pe_expts[block_name]["branches"]
]

# The per-branch settings (nruns, startfrom_restart) carry one entry per branch.
runner_config = dict(
    test_path=test_path,
    repository_directory=repository_directory,
    # these branches need to match branches in the experiment generator config above.
    running_branches=all_branches,
    keep_uuid=True,
    nruns=[3 for _ in all_branches],
    startfrom_restart=["cold" for _ in all_branches],
)

# Run the experiment runner
ExperimentRunner(runner_config).run()

* Current Branch: esm1p6-layout-unused-cores-to-cice-False_atm_16x13_mom_16x12_ice_12x1
    experiment_uuid: f81ae85e-e40c-43cc-a9c9-90ccfba29379
Branch: ctrl
    experiment_uuid: 22561c0f-72b4-4024-b385-a772e2c8301c
Branch: esm1p6-layout-unused-cores-to-cice-False_atm_16x13_mom_14x14_ice_12x1
    experiment_uuid: 48bd5259-ed74-4e86-924a-80a62ccf08fb
Branch: esm1p6-layout-unused-cores-to-cice-False_atm_16x13_mom_15x13_ice_12x1
    experiment_uuid: 571a380a-066a-4e54-87dc-fb6791b30d58
Branch: esm1p6-layout-unused-cores-to-cice-False_atm_16x14_mom_15x12_ice_12x1
    experiment_uuid: 0f7a5b08-7c7a-4c9f-9417-a541b8ae2f36
Branch: main
    No config file found
-- Cloning branch 'esm1p6-layout-unused-cores-to-cice-False_atm_16x13_mom_14x14_ice_12x1' into access-esm1.6_scaling_layouts/esm1p6-layout-unused-cores-to-cice-False_atm_16x13_mom_14x14_ice_12x1/access-esm1.6-PI-config...
Cloned repository from access-esm1.6_scaling_layouts/access-esm1.6-PI-config to directory: /home/593/ms2335/codes/s

## Parsing and plotting experiment results

We'll use the profiling parsers in access-profiling to do this.

In [None]:
# Create the two parsers used below: one for the FMS (MOM5) timing output
# and one for the payu JSON job logs.
from access.profiling import FMSProfilingParser, PayuJSONProfilingParser

payuparser = PayuJSONProfilingParser()
fmsparser = FMSProfilingParser(has_hits=False)

In [None]:
# Processing logs
from pathlib import Path
import glob

# NOTE(review): this layout (<test_path>/<N>nodes/esm1.6_PI_Concentrations/archive) does not
# look like the per-branch directories reported in the experiment runner output earlier in
# this notebook -- confirm the paths before running.
node_counts = (1, 2, 4, 8)

fmslogs = []   # raw text of the model output log, one entry per node count
payulogs = []  # raw text of the payu JSON job log, one entry per node count
for nnodes in node_counts:
    archive_dir = Path(test_path) / f"{nnodes}nodes" / "esm1.6_PI_Concentrations" / "archive"
    fmslogs.append((archive_dir / "output000" / "access-esm1.6.out").read_text())

    # glob returns matches in arbitrary order, so sort to make the choice deterministic,
    # and fail with a clear message instead of an IndexError when nothing matches.
    payu_pattern = str(archive_dir / "payu_jobs" / "0" / "run" / "*.gadi-pbs.json")
    payu_matches = sorted(glob.glob(payu_pattern))
    if not payu_matches:
        raise FileNotFoundError(f"No payu job log matching {payu_pattern}")
    payulogs.append(Path(payu_matches[0]).read_text())

In [None]:
# Parse the FMS logs into a series indexed by core count ("ncpus"); one value per log.
mom5_ncpus = [104, 208, 416, 832]
mom5_stats = fmsparser.parse_data_series(fmslogs, "ncpus", mom5_ncpus)
mom5_stats

In [None]:
# Parse the payu job logs into a series indexed by core count ("ncpus"); one value per log.
payu_ncpus = [104, 208, 416, 832]
payu_stats = payuparser.parse_data_series(payulogs, "ncpus", payu_ncpus)
payu_stats.region

In [None]:
from access.profiling.scaling import plot_scaling_metrics
from access.profiling.metrics import tmax

# `regions` holds one list per entry of `stats`, in the same order:
# the "Ocean" region for the FMS stats and "payu_model_run_duration_seconds" for the payu stats.
# NOTE(review): only `tmax` is passed as the metric -- confirm that is the intended
# metric for the payu region as well.
plot_scaling_metrics(
    stats=[mom5_stats, payu_stats],
    regions=[["Ocean"], ["payu_model_run_duration_seconds"]],
    metric=tmax,
)