In [1]:
# Change the kernel's working directory to the Monarch examples folder.
# NOTE(review): presumably needed so the later `slurm.utils_with_init` import
# resolves; this absolute path is machine-specific and must be adapted — confirm.
%cd /mnt/models/mreso/monarch/examples/

/mnt/models/mreso/monarch/examples


In [2]:
import os

# Defaults for the NCCL/RCCL, GPU runtime and tokenizer environment variables
# used by this example. `setdefault` only fills in variables that are not
# already exported, so values coming from the shell / job launcher win.
# NOTE(review): the specific values look cluster-specific — confirm before reuse.
ENV_DEFAULTS = {
    "GPU_MAX_HW_QUEUES": "2",
    "TORCH_NCCL_HIGH_PRIORITY": "1",
    "NCCL_CHECKS_DISABLE": "1",
    "NCCL_IB_GID_INDEX": "3",
    "NCCL_CROSS_NIC": "0",
    "CUDA_DEVICE_MAX_CONNECTIONS": "1",
    "NCCL_PROTO": "Simple",
    "RCCL_MSCCL_ENABLE": "0",
    "TOKENIZERS_PARALLELISM": "false",
    "HSA_NO_SCRATCH_RECLAIM": "1",
    "NCCL_PXN_DISABLE": "0",
    "NCCL_P2P_NET_CHUNKSIZE": "262144",
}

# A loop (rather than a trailing `setdefault` expression) also avoids echoing
# the last value as the cell's output.
for env_name, env_value in ENV_DEFAULTS.items():
    os.environ.setdefault(env_name, env_value)

'262144'

## Monarch + TorchTitan on SLURM
This example notebook demonstrates how you can easily run and iterate on a distributed training job with Monarch and TorchTitan.

#### Prerequisites
Please make sure your environment is set up for this notebook:
1. Install Monarch nightly: https://github.com/meta-pytorch/monarch/blob/main/scripts/install_nightly.py
2. Install Titan nightly: https://github.com/pytorch/torchtitan?tab=readme-ov-file#nightly-builds
3. Ensure you have a valid Titan model config in the script directory (e.g., https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/train_configs/debug_model.toml)

### 1. Reserve your SLURM job
If necessary, update parameters for your cluster:
- host_type: TorchX named resource for your cluster (default: "gpu.xlarge")
- host_memory: Memory per machine in MB (default: 2062607)

For more information on TorchX resources: https://docs.pytorch.org/torchx/main/specs.html#resource

In [None]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

from slurm.utils_with_init import get_appdef, get_server_info, create_proc_mesh


# Number of hosts to reserve. This module-level name is also read later by
# `RunParams.num_nodes`, so keep the name stable.
num_nodes = 2 # assign for your system
# Build the job definition for `num_nodes` hosts using the helper imported from
# `slurm.utils_with_init`. Uncomment `host_type` to override the helper's default.
appdef = await get_appdef(
    num_nodes,
    # host_type = ...
)
# Obtain the server info for that job definition; `server_info` is consumed
# later when creating the proc mesh and when killing the job.
# NOTE(review): presumably this launches (or reuses) the SLURM job — confirm in
# `slurm.utils_with_init`.
server_info = await get_server_info(
    appdef,
    # host_memory = ...
)

torchx.schedulers.slurm_scheduler 2025-10-02 22:31:07 INFO unable to get job info for `monarch-root` with `squeue` (squeue: error: Invalid job id: monarch-root
), trying `sacct`
torchx.schedulers.slurm_scheduler 2025-10-02 22:31:07 INFO unable to get job info for `monarch-root` with `sacct` (sacct: fatal: Bad job/step specified: monarch-root
)
monarch.tools.commands 2025-10-02 22:31:07 INFO no existing RUNNING server `slurm:///monarch-root` creating new one...
torchx.runner.api 2025-10-02 22:31:07 INFO Tracker configurations: {}
torchx.runner.api 2025-10-02 22:31:07 INFO Checking for changes in workspace `/root/.monarch/out/tmp9e11pybo/workspace`...
torchx.runner.api 2025-10-02 22:31:07 INFO To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.
torchx.runner.api 2025-10-02 22:31:07 INFO Reusing original image `monarch_default_workspace:latest` for role[0]=mesh0. Either a patch was built or no changes to workspace was detected.
monarch.tools.commands 20

app_id='monarch-root'
slurm_24_11_0=(24, 11)
slurm_version=(21, 8)
script='#!/bin/bash\n#\n# Generated by TorchX 0.8.0dev0\n# Run with: sbatch --parsable /tmp/tmp_df6eja0/torchx-sbatch.sh\n#\n#SBATCH --job-name=mesh0-0 --requeue --ntasks-per-node=1 --cpus-per-task=64 --mem=2062607 --gpus-per-node=8\n\nset -evx\n\nexport PYTHONUNBUFFERED=1\nexport SLURM_UNBUFFEREDIO=1\nexport TORCHX_MAX_RETRIES=0\n\nset +e\nsrun --output=slurm-"$SLURM_JOB_ID"-mesh0-0.out --error=slurm-"$SLURM_JOB_ID"-mesh0-0.err --wait=60 --kill-on-bad-exit=1 --export=ALL,WORKSPACE_DIR=monarch_default_workspace:latest/workspace,PYTHONPATH=monarch_default_workspace:latest/workspace/,CONDA_DIR=monarch_default_workspace:latest/conda,TORCHX_JOB_ID=slurm:///"$SLURM_JOB_ID",TORCHX_INTERNAL_SESSION_ID=f73b1624-c6bd-49ce-b0ef-6f5c1440c5cc process_allocator --port=26600 --program=/mnt/models/mreso/monarch/examples/custom_bootstrap.sh\nexitcode=$?\nset -e\n\necho "job exited with code $exitcode"\nif [ $exitcode -ne 0 ]; then\n   

### 2. Define your Titan and cluster parameters

In [4]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

from torchtitan.train import Trainer
from torchtitan.config import ConfigManager, JobConfig
from monarch.actor import Actor, current_rank, endpoint
from torchtitan.tools.logging import init_logger, logger
import torch
from dataclasses import dataclass
import os
from monarch.tools import commands
from monarch.utils import setup_env_for_distributed


@dataclass
class RunParams:
    """
    Parameters for your cluster and training job, adjust as needed.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field (un-annotated assignments are ignored by the decorator).
    The defaults remain available as class attributes, so existing
    ``RunParams.<name>`` lookups keep working.
    """
    # Number of training steps, forwarded as ``--training.steps``.
    training_steps: int = 50
    # TorchTitan TOML config, forwarded as ``--job.config_file``.
    model_config: str = "/mnt/models/mreso/torchtitan/torchtitan/models/llama3/train_configs/debug_model.toml"
    # model_config = "/mnt/models/mreso/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml"
    # model_config = "/mnt/models/mreso/torchtitan/torchtitan/models/llama3/train_configs/llama3_70b.toml"
    # Dataset name, forwarded as ``--training.dataset``.
    dataset: str = "c4"
    # Host count; defaults to the module-level ``num_nodes`` chosen when the
    # SLURM job was reserved.
    num_nodes: int = num_nodes
    # GPUs available on each host.
    gpus_per_node: int = 8


class TrainerActor(Actor):
    """
    A simple wrapper class which executes a TorchTitan trainer in a Monarch actor.
    """
    def __init__(self, job_config: JobConfig) -> None:
        """
        Store the job config and derive a per-rank log prefix.

        Args:
            job_config: TorchTitan job configuration handed to ``Trainer``.
        """
        self.job_config = job_config
        rank = current_rank().rank
        # Prefix used in log messages to tell the actors apart.
        self.uid = f"[trainer_{rank}]"

    @endpoint
    async def start_training(self) -> None:
        """
        Build a TorchTitan ``Trainer``, run training, and always clean up.

        Raises:
            Exception: whatever ``Trainer(...)`` or ``trainer.train()`` raised;
                cleanup runs first and does not replace the original error.
        """
        init_logger()
        trainer: Trainer | None = None

        try:
            trainer = Trainer(self.job_config)
            logger.info(f"{self.uid} initialized successfully and starting training")
            trainer.train()
        finally:
            try:
                # Only close a trainer that was actually constructed.
                if trainer is not None:
                    trainer.close()
            finally:
                # If construction failed before the process group was created,
                # an unconditional destroy would raise and mask the real error.
                if torch.distributed.is_available() and torch.distributed.is_initialized():
                    torch.distributed.destroy_process_group()
                logger.info(f"{self.uid} trainer cleaned up")

def make_job_config() -> JobConfig:
    """
    Create a job config which is digested by TorchTitan, sourced from RunParams.

    Returns:
        The ``JobConfig`` produced by ``ConfigManager.parse_args`` from the
        CLI-style argument list assembled below.
    """
    output_path = "./outputs"

    # IPython keeps its directory history in the global `_dh`; the first entry
    # is used as the base directory. Fall back to the current working directory
    # when `_dh` is missing (e.g. when not running under IPython) instead of
    # failing with a KeyError.
    directory_history = globals().get("_dh") or [os.getcwd()]
    script_dir = directory_history[0]

    default_args = [
        "--job.config_file",
        # An absolute `model_config` is returned unchanged by os.path.join.
        os.path.join(script_dir, RunParams.model_config),
        "--model.tokenizer_path",
        "/mnt/models/mreso/torchtitan/tests/assets/tokenizer/",
        # "/mnt/models/mreso/torchtitan/assets/hf/Llama-3.1-8B/",
        # "/mnt/models/mreso/torchtitan/assets/hf/Llama-3.1-70B/",
        "--comm.trace_buf_size",
        "0",
        "--metrics.log_freq",
        "1",
        "--parallelism.data_parallel_shard_degree",
        # NOTE(review): -1 presumably lets TorchTitan derive the shard degree
        # from the available ranks — confirm against the TorchTitan config docs.
        str(-1),
        "--activation_checkpoint.mode",
        "full",
        "--comm.train_timeout_seconds",
        "60",
        "--training.steps",
        str(RunParams.training_steps),
        "--training.dataset",
        RunParams.dataset,
        "--job.dump_folder",
        output_path,
        "--metrics.enable_tensorboard",
    ]

    config_manager = ConfigManager()
    job_config = config_manager.parse_args(default_args)

    return job_config

### 3. Execute your training job
You can make adjustments and run this on the existing SLURM allocations as many times as you would like!

In [None]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

async def main():
    """
    Run one training job on the existing SLURM allocation.

    Creates a proc mesh, configures log streaming, prepares the distributed
    environment, spawns one ``TrainerActor`` per process and starts training.
    Failures are logged (with traceback) rather than re-raised so the notebook
    cell completes, and the proc mesh is always stopped afterwards.
    """
    job_config = make_job_config()
    proc_mesh = None

    try:
        # 1. Create a proc mesh on your SLURM allocation
        print("CREATING PROC MESH")
        proc_mesh = await create_proc_mesh(RunParams.num_nodes, appdef, server_info)

        # 2. Define remote logging behavior
        await proc_mesh.logging_option(
            stream_to_client=True,
            aggregate_window_sec=None
        )
        # 3. Prepare trainer for torch distributed
        print("SETUP ENV FOR DISTRIBUTED")
        await setup_env_for_distributed(
            proc_mesh,
            )

        # 4. Spawn the trainer actors on the mesh
        print("SPAWNING TRAINER")
        trainer = await proc_mesh.spawn("trainer_actor", TrainerActor, job_config)
        # 5. Execute the training job
        print("CALLING TRAINER")
        await trainer.start_training.call()
    except Exception as e:
        # Log at ERROR level with the traceback; an INFO record is easy to miss
        # among the streamed training logs.
        logger.exception(f"Trainer failed: {e}")
    finally:
        # Stop the mesh only if it was created; the SLURM job itself stays alive.
        if proc_mesh is not None:
            await proc_mesh.stop()


# Top-level `await` works here because notebook cells are executed by IPython,
# which supports awaiting at cell level; a plain Python script would need
# something like `asyncio.run(main())` instead.
if __name__ == "__main__":
    await main()

slurm.utils 2025-10-02 22:31:20 INFO 
===== Server Info =====
{
  "name": "1354",
  "server_handle": "slurm:///1354",
  "state": "RUNNING",
  "meshes": {
    "mesh0": {
      "host_type": "__UNSET__",
      "hosts": 2,
      "gpus": -1,
      "hostnames": [
        "chi2599"
      ]
    }
  }
}
monarch._src.actor.allocator 2025-10-02 22:31:20 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints
monarch._src.actor.allocator 2025-10-02 22:31:20 INFO found a single proc mesh `mesh0` in slurm:///1354, will allocate on it
monarch.tools.network 2025-10-02 22:31:20 INFO no AF_INET6 address that can bind TCP sockets for `chi2599:26600` (error: [Errno -3] Temporary failure in name resolution)
monarch.tools.network 2025-10-02 22:31:20 INFO resolved AF_INET address `108.61.203.106:26600` for `chi2599:26600`
monarch._src.actor.allocator 2025-10-02 22:31:20 INFO initializing alloc on remote allocator addresses: ['tcp!108.61.203.106:26600']
monarch._src.actor.allocator

[DEBUG] start_comm_watcher: ordered_hosts.len() = 1
[DEBUG] start_comm_watcher: ordered_hosts = [RemoteProcessAllocHost { id: "108.61.203.106", hostname: "108.61.203.106" }]
[DEBUG] start_comm_watcher: host_states.len() = 1
[DEBUG] start_comm_watcher: host_states keys = ["108.61.203.106"]
[DEBUG] start_comm_watcher: looking up host.id = "108.61.203.106"


CREATING PROC MESH
app_id='1354'
output='{\n  "meta": {\n    "plugin": {\n      "type": "openapi\\/v0.0.37",\n      "name": "Slurm OpenAPI v0.0.37"\n    },\n    "Slurm": {\n      "version": {\n        "major": 21,\n        "micro": 5,\n        "minor": 8\n      },\n      "release": "21.08.5"\n    }\n  },\n  "errors": [\n  ],\n  "jobs": [\n    {\n      "account": "",\n      "accrue_time": 1759443920,\n      "admin_comment": "",\n      "array_job_id": 0,\n      "array_task_id": null,\n      "array_max_tasks": 0,\n      "array_task_string": "",\n      "association_id": 0,\n      "batch_features": "",\n      "batch_flag": true,\n      "batch_host": "chi2602",\n      "flags": [\n        "JOB_CPUS_SET ",\n        "JOB_WAS_RUNNING"\n      ],\n      "burst_buffer": "",\n      "burst_buffer_state": "",\n      "cluster": "vultr-mi325x-torch",\n      "cluster_features": "",\n      "command": "\\/mnt\\/models\\/xinyu\\/torchtitan-amd\\/run_slurm_pretrain.sh",\n      "comment": "",\n      "contiguo

root 2025-10-02 22:31:41 INFO Trainer failed: A remote actor call has failed.
 Traceback of where the remote call failed (most recent call last):
  File "/mnt/models/mreso/monarch/python/monarch/_src/actor/actor_mesh.py", line 948, in handle
    result = await instrumented()
             ^^^^^^^^^^^^^^^^^^^^
  File "/mnt/models/mreso/monarch/python/monarch/_src/actor/actor_mesh.py", line 945, in instrumented
    raise e
  File "/mnt/models/mreso/monarch/python/monarch/_src/actor/actor_mesh.py", line 938, in instrumented
    result = await the_method(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_1429727/129428210.py", line 43, in start_training
    trainer = Trainer(self.job_config)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/models/mreso/monarch/miniforge3/envs/monarch/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^

### 4. Destroy the SLURM job
Once you're done experimenting, free up the allocation

In [6]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

# Kill the server reserved in step 1 to free the allocation. The handle is
# rebuilt from `server_info.name` using the `slurm:///` scheme.
commands.kill(f"slurm:///{server_info.name}")

app_id='1354'
output='{\n  "meta": {\n    "plugin": {\n      "type": "openapi\\/v0.0.37",\n      "name": "Slurm OpenAPI v0.0.37"\n    },\n    "Slurm": {\n      "version": {\n        "major": 21,\n        "micro": 5,\n        "minor": 8\n      },\n      "release": "21.08.5"\n    }\n  },\n  "errors": [\n  ],\n  "jobs": [\n    {\n      "account": "",\n      "accrue_time": 1759443920,\n      "admin_comment": "",\n      "array_job_id": 0,\n      "array_task_id": null,\n      "array_max_tasks": 0,\n      "array_task_string": "",\n      "association_id": 0,\n      "batch_features": "",\n      "batch_flag": true,\n      "batch_host": "chi2602",\n      "flags": [\n        "JOB_CPUS_SET ",\n        "JOB_WAS_RUNNING"\n      ],\n      "burst_buffer": "",\n      "burst_buffer_state": "",\n      "cluster": "vultr-mi325x-torch",\n      "cluster_features": "",\n      "command": "\\/mnt\\/models\\/xinyu\\/torchtitan-amd\\/run_slurm_pretrain.sh",\n      "comment": "",\n      "contiguous": false,\n      