In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# ESM1.6 Scaling

End-to-end pipeline generating scaling plots for the ESM1.6 pre-industrial configuration.

## Setup
The following variables control where the configurations and model run outputs are saved.

In [7]:
import os
import getpass

# Identity of whoever runs the notebook; both values are interpolated into
# the paths below and into the generated experiment configuration.
username = getpass.getuser()
defaultproject = os.environ.get("PROJECT")  # None when $PROJECT is not set

# CHANGE test_root to where you're comfortable dumping model run information
test_root = f"/scratch/{defaultproject}/{username}/access_models_scaling_results"

# Name of the experiment
model_type = "access-esm1.6"
# Everything for this study goes into an experiment-specific directory under
# test_root. Previously test_path was reassigned to the bare relative name,
# which silently discarded the location chosen above.
test_path = f"{test_root}/{model_type}_scaling_layouts"
repository_directory = f"{model_type}-PI-config"


## Generate experiments

We will perform a simple scaling study, varying the total number of nodes used over [0.5, 1.0, 2.0, 3.0, 5.0], i.e., we will run the model with 52, 104, 208, 312 and 520 cores.

In [8]:
def get_node_configs():
    """Return the node counts to scan and one layout-search config per count.

    Returns:
        tuple: ``(num_nodes_list, scaling_configs)`` where ``num_nodes_list``
        is the list of (possibly fractional) node counts and
        ``scaling_configs`` holds one ``LayoutSearchConfig`` per node count,
        in the same order.
    """
    from access.config.esm1p6_layout_input import LayoutSearchConfig

    def _wasted_cores_limit(nnodes):
        # The permitted fraction of unused cores shrinks as the node count grows.
        if nnodes <= 1:
            return 0.1
        if nnodes <= 3:
            return 0.05
        return 0.02

    num_nodes_list = [0.5, 1.0, 2.0, 3.0, 5.0]

    # The same tolerance around the control ratio is used for every node count.
    ctrl_ratio_tol = 0.05

    scaling_configs = []
    for nnodes in num_nodes_list:
        scaling_configs.append(
            LayoutSearchConfig(
                tol_around_ctrl_ratio=ctrl_ratio_tol,
                max_wasted_ncores_frac=_wasted_cores_limit(nnodes),
            )
        )

    return num_nodes_list, scaling_configs


In [9]:

def generate_esm1p6_layouts() -> str:
    import sys

    from access.config.esm1p6_layout_input import generate_esm1p6_core_layouts_from_node_count, generate_esm1p6_perturb_block

    control_expriment_name = "Control_Experiment"
    generator_config_prefix = f"""
model_type: {model_type}
repository_url: git@github.com:ACCESS-NRI/{model_type}-configs.git
start_point: 1ebd393 # the commit hash that access-bot refers to when committing the new checksums

test_path: {test_path}
repository_directory: {repository_directory}

control_branch_name: ctrl

{control_expriment_name}:
    config.yaml:
        walltime: Inf
        modules:
            use:
                - /g/data/{defaultproject}/{username}/spack/0.22/release/modules/linux-rocky8-x86_64_v4
            load:
                - access-esm1p6/2025.09.002

        

        manifest:
            reproduce:
                exe: False    # cice5 has to be manually compiled because the runtime core counts is set at compile time
        repeat: True
        runspersub: 10

Perturbation_Experiment:
"""

    blocknum = 1
    queue = "normalsr"
    branch_name_prefix = "esm1p6-layout"
    entire_block = generator_config_prefix
    num_nodes_list, scaling_configs = get_node_configs()
    # num_nodes_list = [4.0]
    # from access.config.esm1p6_layout_input import LayoutSearchConfig
    # scaling_configs = [LayoutSearchConfig(tol_around_ctrl_ratio=None, max_wasted_ncores_frac=0.02)]*len(num_nodes_list)

    seen_layouts = set()
    walltime_hrs = 0.0
    for num_nodes, config in zip(num_nodes_list, scaling_configs, strict=True):
        layout = generate_esm1p6_core_layouts_from_node_count(
            num_nodes,
            queue=queue,
            layout_search_config=config,
        )[0]
        if not layout:
            print(f"No layouts found for {num_nodes} nodes", file=sys.stderr)
            continue

        layout = [x for x in layout if x not in seen_layouts]
        seen_layouts.update(layout)
        print(f"Generated {len(layout)} layouts for {num_nodes} nodes. Layouts: {layout}")

        branch_name = f"{branch_name_prefix}-unused-cores-to-cice-{config.allocate_unused_cores_to_ice}"
        prev_blocknum = blocknum
        block, blocknum = generate_esm1p6_perturb_block(
            num_nodes, layout, branch_name, queue=queue, start_blocknum=blocknum,
        )
        nblocks_added = blocknum - prev_blocknum
        walltime_hrs += nblocks_added * (1.5 * 4.0/num_nodes) # use a 1.5 hrs time for 4-node runs, and then scale linearly
        entire_block += block

    entire_block = entire_block.replace("walltime: Inf", f"walltime: {int(walltime_hrs)}:00:00")

    return entire_block

In [10]:
from experiment_generator.experiment_generator import ExperimentGenerator
from ruamel.yaml import YAML

# YAML loader used to parse the generated configuration text;
# preserve_quotes is switched on so quoting in the source text is retained.
ryaml = YAML()
ryaml.preserve_quotes = True

# Build the configuration as YAML text, parse it into a mapping, and hand the
# mapping to the experiment generator. generator_config and config_dict are
# reused by later cells (writing the file, collecting branch names).
generator_config = generate_esm1p6_layouts()
config_dict = ryaml.load(generator_config)
expgen = ExperimentGenerator(config_dict)
# NOTE(review): presumably this creates the experiment branches — confirm
# against the experiment-generator documentation.
expgen.run()

Min. total cores required for a valid config (47) should be greater than the number of ATM + OCN cores (51). Currently, any config that satisfies the ATM + OCN core requirements will also satisfy the requirement for the min. total cores
Min. total cores required for a valid config (94) should be greater than the number of ATM + OCN cores (101). Currently, any config that satisfies the ATM + OCN core requirements will also satisfy the requirement for the min. total cores
Min. total cores required for a valid config (198) should be greater than the number of ATM + OCN cores (202). Currently, any config that satisfies the ATM + OCN core requirements will also satisfy the requirement for the min. total cores


Min. total cores required for a valid config (297) should be greater than the number of ATM + OCN cores (303). Currently, any config that satisfies the ATM + OCN core requirements will also satisfy the requirement for the min. total cores


Generated 1 layouts for 0.5 nodes. Layouts: [LayoutTuple(atm_nx=6, atm_ny=4, mom_nx=6, mom_ny=4, ice_ncores=1)]
Generated 2 layouts for 1.0 nodes. Layouts: [LayoutTuple(atm_nx=8, atm_ny=6, mom_nx=8, mom_ny=6, ice_ncores=3), LayoutTuple(atm_nx=8, atm_ny=6, mom_nx=9, mom_ny=5, ice_ncores=3)]
Generated 4 layouts for 2.0 nodes. Layouts: [LayoutTuple(atm_nx=10, atm_ny=10, mom_nx=10, mom_ny=10, ice_ncores=6), LayoutTuple(atm_nx=10, atm_ny=10, mom_nx=11, mom_ny=9, ice_ncores=6), LayoutTuple(atm_nx=10, atm_ny=10, mom_nx=12, mom_ny=8, ice_ncores=6), LayoutTuple(atm_nx=12, atm_ny=8, mom_nx=12, mom_ny=8, ice_ncores=6)]
Generated 3 layouts for 3.0 nodes. Layouts: [LayoutTuple(atm_nx=14, atm_ny=11, mom_nx=12, mom_ny=12, ice_ncores=9), LayoutTuple(atm_nx=14, atm_ny=11, mom_nx=13, mom_ny=11, ice_ncores=9), LayoutTuple(atm_nx=14, atm_ny=11, mom_nx=14, mom_ny=10, ice_ncores=9)]
Generated 2 layouts for 5.0 nodes. Layouts: [LayoutTuple(atm_nx=18, atm_ny=14, mom_nx=18, mom_ny=14, ice_ncores=15), LayoutTup

## Write the config into a file


In [6]:
# Save the generated YAML text next to the notebook so the exact generator
# input can be inspected later. The path is relative, so the file lands in the
# kernel's current working directory.
config_yaml_name="esm1p6_layout_input_config_other_nodes.yaml"
with open(config_yaml_name, "w") as f:
    f.write(generator_config)
print(f"Generator config written to {config_yaml_name}")

Generator config written to esm1p6_layout_input_config_other_nodes.yaml


## Running the model

Next we create a runner config and run the experiments with `experiment-runner`.

In [7]:
from experiment_runner.experiment_runner import ExperimentRunner

# Collect every branch name declared under the perturbation experiments of the
# generator config, preserving their order.
pe_expts = config_dict["Perturbation_Experiment"]
all_branches = [
    branch
    for expt in pe_expts.values()
    for branch in expt["branches"]
]

# One entry per branch for the per-branch settings.
runner_config = dict(
    test_path=test_path,
    repository_directory=repository_directory,
    # these branches need to match branches in the experiment generator config above.
    running_branches=all_branches,
    keep_uuid=True,
    nruns=[3 for branch in all_branches],
    startfrom_restart=["cold" for branch in all_branches],
)

# Run the experiment runner
ExperimentRunner(runner_config).run()

* Current Branch: esm1p6-layout-unused-cores-to-cice-False_atm_16x16_mom_16x15_ice_15x1
    experiment_uuid: 56b6751f-6fa1-4dab-8b8f-0603e16297e2
Branch: ctrl
    experiment_uuid: 22561c0f-72b4-4024-b385-a772e2c8301c
Branch: esm1p6-layout-unused-cores-to-cice-False_atm_10x10_mom_10x10_ice_6x1
    experiment_uuid: 6ee0710a-b170-4d8f-8b68-94652d851d99
Branch: esm1p6-layout-unused-cores-to-cice-False_atm_10x10_mom_11x9_ice_6x1
    experiment_uuid: 11795297-85a5-4315-8061-d8252c226865
Branch: esm1p6-layout-unused-cores-to-cice-False_atm_10x10_mom_12x8_ice_6x1
    experiment_uuid: 1c46c4ff-b79c-4309-81d7-c5d8d37be036
Branch: esm1p6-layout-unused-cores-to-cice-False_atm_12x8_mom_12x8_ice_6x1
    experiment_uuid: 68036896-51cf-453c-b4f7-ea179aa86071
Branch: esm1p6-layout-unused-cores-to-cice-False_atm_14x11_mom_12x12_ice_9x1
    experiment_uuid: 269d2ddb-6028-4f1c-9626-76b01fcddda4
Branch: esm1p6-layout-unused-cores-to-cice-False_atm_14x11_mom_13x11_ice_9x1
    experiment_uuid: 3907d81a-e472-

## Parsing and plotting experiment results

We'll use the profiling parsers in access-profiling to do this.

In [None]:
# Start by processing FMS (MOM5) results
from access.profiling import FMSProfilingParser
from access.profiling import PayuJSONProfilingParser
# One parser per log type: fmsparser is applied to the model output logs and
# payuparser to the payu job JSON files collected in the following cells.
# NOTE(review): has_hits=False presumably means the FMS timing table has no
# "hits" column — confirm against the access-profiling documentation.
fmsparser = FMSProfilingParser(has_hits=False)
payuparser = PayuJSONProfilingParser()

In [None]:
# Processing logs
from pathlib import Path
import glob

# Raw log text, one entry per node count, in the order of the loop below.
fmslogs = []
payulogs = []
# NOTE(review): the node counts (1, 2, 4, 8) and the
# "<n>nodes/esm1.6_PI_Concentrations" directory layout used here do not match
# the 0.5/1/2/3/5-node layouts generated earlier in this notebook — confirm
# the archive locations before running this cell.
for nnodes in (1, 2, 4, 8):
    archive_dir = f"{test_path}/{nnodes}nodes/esm1.6_PI_Concentrations/archive"
    # Model log from the first output directory of this run.
    with open(archive_dir + "/output000/access-esm1.6.out", "r") as f:
        fmslogs.append(f.read())
    # Take the first matching payu job JSON; [0] raises IndexError when the
    # glob matches nothing.
    payulog = glob.glob(archive_dir + "/payu_jobs/0/run/*.gadi-pbs.json")[0]
    with open(payulog, "r") as f:
        payulogs.append(f.read())

In [None]:
# Parse the collected FMS logs into one data series indexed by "ncpus".
# The core counts are listed in the same order as the logs were read and are
# hard-coded; they must stay in sync with the node counts of the log-reading loop.
mom5_stats = fmsparser.parse_data_series(
    fmslogs,
    "ncpus",
    [104, 208, 416, 832]
)
mom5_stats

In [None]:
# Parse the collected payu job JSON logs into one data series indexed by
# "ncpus", using the same hard-coded core counts as for the FMS logs.
payu_stats = payuparser.parse_data_series(
    payulogs,
    "ncpus",
    [104, 208, 416, 832]
)
# Display the regions available in the parsed payu statistics.
payu_stats.region

In [None]:
from access.profiling.scaling import plot_scaling_metrics
from access.profiling.metrics import tmax

# plots the Ocean region and tmax metric from the FMS stats, and
# payu_model_run_duration_seconds region and walltime metric from payu stats.
# NOTE(review): only the tmax metric is passed below; confirm how the walltime
# metric mentioned above is selected for the payu statistics.
# stats and regions are parallel lists: the i-th list of regions is plotted
# from the i-th statistics object.
plot_scaling_metrics(
    stats=[mom5_stats, payu_stats],
    regions=[
        ["Ocean"],
        ["payu_model_run_duration_seconds"]
    ],
    metric=tmax,
)