In [3]:
%cd /mnt/models/mreso/monarch/examples/

/mnt/models/mreso/monarch/examples


[-]E1007 18:32:49.758845 3758455 hyperactor/src/mailbox.rs:335] name:undelivered_message_attempt, sender:foo_client[0].client[0], dest:_1tX5BHYRHwRr[0].comm[0][13147652568889606402], error:broken link: failed to enqueue in MailboxClient, return_handle:foo_client[0].client[0]<hyperactor::mailbox::undeliverable::Undeliverable<hyperactor::mailbox::MessageEnvelope>>


In [None]:


# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

# @noautodeps
# pyre-ignore-all-errors
import logging
import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim

from monarch.tools import commands
from monarch.actor import Actor, current_rank, endpoint
from monarch.utils import setup_env_for_distributed
from torch.nn.parallel import DistributedDataParallel as DDP
from slurm.utils_with_init import (
    get_appdef, 
    get_server_info, 
    create_proc_mesh,
)

# Turn on verbose Rust-side diagnostics (full backtraces, debug-level logs).
# These are set before any actor or mesh is created in this cell.
os.environ.update(
    {
        "RUST_BACKTRACE": "full",
        "RUST_LOG": "debug",
    }
)

# Configure the root logger at DEBUG. `force=True` drops any handlers that
# were already attached (e.g. by an earlier run of this cell) so the format
# below always takes effect.
logging.basicConfig(
    force=True,
    level=logging.DEBUG,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(name)s %(asctime)s %(levelname)s %(message)s",
)

# Module-level logger for this notebook cell.
logger: logging.Logger = logging.getLogger(__name__)

class BarrierActor(Actor):
    """Actor that joins a NCCL process group and runs an all-reduce smoke test.

    Each instance pins itself to one local GPU, joins the ``torch.distributed``
    process group described by the ``WORLD_SIZE`` / ``MASTER_ADDR`` /
    ``MASTER_PORT`` / ``RANK`` environment variables, and can then run a single
    ``all_reduce`` to check that every rank can communicate.

    The setup/cleanup structure is adapted from:
    https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case
    """

    def __init__(self, gpus_per_host: int = 8):
        """Record this actor's rank and derive its local GPU index.

        Args:
            gpus_per_host: Number of GPUs per host. The local device index is
                ``rank % gpus_per_host``. Defaults to 8, the value that was
                previously hard-coded.

        Raises:
            ValueError: If ``gpus_per_host`` is not positive.
        """
        if gpus_per_host <= 0:
            raise ValueError(f"gpus_per_host must be positive, got {gpus_per_host}")
        # Ask NCCL for verbose logs; set here so it is in place before the
        # process group is created in `setup`.
        os.environ["NCCL_DEBUG"] = "INFO"
        self.rank = current_rank().rank
        # NOTE(review): assumes ranks are assigned host by host, so that
        # rank % gpus_per_host is the GPU index on the local host -- confirm.
        self.local_rank = int(self.rank % gpus_per_host)

    def _rprint(self, msg):
        """Print ``msg`` prefixed with this actor's rank."""
        print(f"{self.rank=} {msg}")

    @endpoint
    async def setup(self):
        """Select the local GPU and initialize the NCCL process group.

        Reads ``WORLD_SIZE`` (required), ``MASTER_ADDR``, ``MASTER_PORT`` and
        ``RANK`` from the environment.

        Raises:
            KeyError: If ``WORLD_SIZE`` is not set in the environment.
        """
        self._rprint("Initializing torch distributed")
        # The device must be selected BEFORE dist.init_process_group.
        torch.cuda.set_device(self.local_rank)
        self._rprint(f"Set GPU device to {self.local_rank}")

        world_size = int(os.environ["WORLD_SIZE"])
        master_addr = os.environ.get("MASTER_ADDR", "localhost")
        master_port = os.environ.get("MASTER_PORT", "12355")
        # Fall back to this actor's own rank rather than 0: if RANK were unset,
        # a constant default would give every process the same rank and the
        # group could never form.
        rank = int(os.environ.get("RANK", self.rank))
        print(f"MASTER_ADDR: {master_addr}, MASTER_PORT: {master_port}, RANK: {rank}")

        dist.init_process_group(
            backend="nccl",
            init_method=f"tcp://{master_addr}:{master_port}",
            world_size=world_size,
            rank=rank,
        )
        self._rprint("Finished initializing torch distributed")

    @endpoint
    async def cleanup(self):
        """Destroy the process group if one was initialized."""
        self._rprint("Cleaning up torch distributed")
        # Guard so cleanup is safe to call even when `setup` failed or never ran.
        if dist.is_initialized():
            dist.destroy_process_group()
        else:
            self._rprint("No process group initialized; nothing to clean up")

    @endpoint
    async def demo_basic(self):
        """Run one all-reduce across the process group and print the result.

        Each rank contributes a one-element CUDA tensor holding its own rank
        value; the reduced tensor is printed along with some CUDA diagnostics.
        """
        self._rprint(f"{os.environ['NCCL_DEBUG']=}")
        torch.cuda.set_device(self.local_rank)
        self._rprint("Running basic DDP example")
        self._rprint(f"{torch.cuda.device_count()=}")
        self._rprint(f"{torch.cuda.current_device()=}")
        # Report the device this rank actually uses (the current device),
        # not always device 0.
        self._rprint(f"{torch.cuda.get_device_name()=}")
        self._rprint(f"{torch.cuda.is_initialized()=}")
        t = self.rank * torch.ones(1).cuda()
        dist.all_reduce(t)
        self._rprint(f"{t=}")
        self._rprint("Finished running basic DDP example")


async def main(num_hosts: int = 2):
    """Launch a proc mesh, run the BarrierActor smoke test, then kill the server.

    Args:
        num_hosts: Number of hosts passed to ``get_appdef`` and
            ``create_proc_mesh``. Defaults to 2, the value that was previously
            hard-coded.

    The server is killed in a ``finally`` block, so it is torn down whether or
    not the mesh creation or any actor call succeeds.
    """
    appdef = await get_appdef(num_hosts)
    server_info = await get_server_info(appdef)

    try:
        print("CREATE PROC MESH")
        proc_mesh = await create_proc_mesh(num_hosts, appdef, server_info)

        # Forward actor-side logs/prints back to this client.
        await proc_mesh.logging_option(
            stream_to_client=True,
        )

        print("SPAWN ACTORS")
        barrier_actor = proc_mesh.spawn("barrier_actor", BarrierActor)
        print("SETUP ENV")
        # Must run before `setup`, which reads the distributed env variables.
        await setup_env_for_distributed(proc_mesh)
        print("SETUP CALL")
        await barrier_actor.setup.call()
        print("BASIC DEMO CALL")
        await barrier_actor.demo_basic.call()
        print("CLEANUP CALL")
        await barrier_actor.cleanup.call()

        print("DDP example completed successfully!")

    finally:
        # Always release the SLURM-backed server, even on failure.
        commands.kill(f"slurm:///{server_info.name}")


# Entry point for the notebook cell. Top-level `await` is only valid because
# this runs inside a notebook/IPython cell; a plain script would have to wrap
# the call (e.g. with asyncio.run).
if __name__ == "__main__":
    await main()

slurm.utils_with_init 2025-10-07 18:32:52 INFO Excluding SLURM nodes: chi2599,chi2600,chi2602,chi2603
torchx.schedulers.slurm_scheduler 2025-10-07 18:32:52 INFO unable to get job info for `monarch-root` with `squeue` (squeue: error: Invalid job id: monarch-root
), trying `sacct`
torchx.schedulers.slurm_scheduler 2025-10-07 18:32:52 INFO unable to get job info for `monarch-root` with `sacct` (sacct: fatal: Bad job/step specified: monarch-root
)
monarch.tools.commands 2025-10-07 18:32:52 INFO no existing RUNNING server `slurm:///monarch-root` creating new one...
torchx.runner.api 2025-10-07 18:32:52 INFO Tracker configurations: {}
torchx.runner.api 2025-10-07 18:32:52 INFO Checking for changes in workspace `/root/.monarch/out/tmpvl0lma6e/workspace`...
torchx.runner.api 2025-10-07 18:32:52 INFO To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.
torchx.runner.api 2025-10-07 18:32:52 INFO Reusing original image `monarch_default_workspace:latest` for role

app_id='monarch-root'
slurm_24_11_0=(24, 11)
slurm_version=(21, 8)
script='#!/bin/bash\n#\n# Generated by TorchX 0.8.0dev0\n# Run with: sbatch --parsable /tmp/tmpk_463xbj/torchx-sbatch.sh\n#\n#SBATCH --job-name=mesh0-0 --requeue --exclude=chi2599,chi2600,chi2602,chi2603 --ntasks-per-node=1 --cpus-per-task=64 --mem=2062607 --gpus-per-node=8\n\nset -evx\n\nexport PYTHONUNBUFFERED=1\nexport SLURM_UNBUFFEREDIO=1\nexport TORCHX_MAX_RETRIES=0\n\nset +e\nsrun --output=slurm-"$SLURM_JOB_ID"-mesh0-0.out --error=slurm-"$SLURM_JOB_ID"-mesh0-0.err --wait=60 --kill-on-bad-exit=1 --export=ALL,WORKSPACE_DIR=monarch_default_workspace:latest/workspace,PYTHONPATH=monarch_default_workspace:latest/workspace/,CONDA_DIR=monarch_default_workspace:latest/conda,TORCHX_JOB_ID=slurm:///"$SLURM_JOB_ID",TORCHX_INTERNAL_SESSION_ID=23c5c483-75ed-4b7e-ad58-1802e7b3597f process_allocator --port=26600 --program=/mnt/models/mreso/monarch/examples/custom_bootstrap_exec.sh\nexitcode=$?\nset -e\n\necho "job exited with cod

slurm.utils_with_init 2025-10-07 18:32:57 INFO 
===== Server Info =====
{
  "name": "1588",
  "server_handle": "slurm:///1588",
  "state": "RUNNING",
  "meshes": {
    "mesh0": {
      "host_type": "__UNSET__",
      "hosts": 2,
      "gpus": -1,
      "hostnames": [
        "chi2605"
      ]
    }
  }
}
asyncio 2025-10-07 18:32:57 DEBUG Using selector: EpollSelector
monarch._src.actor.allocator 2025-10-07 18:32:57 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints
monarch._src.actor.allocator 2025-10-07 18:32:57 INFO found a single proc mesh `mesh0` in slurm:///1588, will allocate on it
monarch.tools.network 2025-10-07 18:32:57 INFO no AF_INET6 address that can bind TCP sockets for `chi2605:26600` (error: [Errno -3] Temporary failure in name resolution)
monarch.tools.network 2025-10-07 18:32:57 INFO resolved AF_INET address `45.63.75.210:26600` for `chi2605:26600`
monarch._src.actor.allocator 2025-10-07 18:32:57 INFO initializing alloc on remote alloca

app_id='1588'
output='{\n  "meta": {\n    "plugin": {\n      "type": "openapi\\/v0.0.37",\n      "name": "Slurm OpenAPI v0.0.37"\n    },\n    "Slurm": {\n      "version": {\n        "major": 21,\n        "micro": 5,\n        "minor": 8\n      },\n      "release": "21.08.5"\n    }\n  },\n  "errors": [\n  ],\n  "jobs": [\n    {\n      "account": "",\n      "accrue_time": 1759861844,\n      "admin_comment": "",\n      "array_job_id": 0,\n      "array_task_id": null,\n      "array_max_tasks": 0,\n      "array_task_string": "",\n      "association_id": 0,\n      "batch_features": "",\n      "batch_flag": true,\n      "batch_host": "chi2605",\n      "flags": [\n        "TRES_STR_CALC",\n        "JOB_CPUS_SET ",\n        "JOB_MEM_SET"\n      ],\n      "burst_buffer": "",\n      "burst_buffer_state": "",\n      "cluster": "vultr-mi325x-torch",\n      "cluster_features": "",\n      "command": "\\/tmp\\/tmp5bw91xuc\\/torchx-sbatch.sh",\n      "comment": "",\n      "contiguous": false,\n      "co

[DEBUG] start_comm_watcher: ordered_hosts.len() = 1
[DEBUG] start_comm_watcher: ordered_hosts = [RemoteProcessAllocHost { id: "45.63.75.210", hostname: "45.63.75.210" }]
[DEBUG] start_comm_watcher: host_states.len() = 1
[DEBUG] start_comm_watcher: host_states keys = ["45.63.75.210"]
[DEBUG] start_comm_watcher: looking up host.id = "45.63.75.210"


SPAWN ACTORS
SETUP ENV
SETUP CALL
BASIC DEMO CALL
CLEAUP CALL
DDP example completed successfully!
app_id='1588'
output='{\n  "meta": {\n    "plugin": {\n      "type": "openapi\\/v0.0.37",\n      "name": "Slurm OpenAPI v0.0.37"\n    },\n    "Slurm": {\n      "version": {\n        "major": 21,\n        "micro": 5,\n        "minor": 8\n      },\n      "release": "21.08.5"\n    }\n  },\n  "errors": [\n  ],\n  "jobs": [\n    {\n      "account": "",\n      "accrue_time": 1759861844,\n      "admin_comment": "",\n      "array_job_id": 0,\n      "array_task_id": null,\n      "array_max_tasks": 0,\n      "array_task_string": "",\n      "association_id": 0,\n      "batch_features": "",\n      "batch_flag": true,\n      "batch_host": "chi2605",\n      "flags": [\n        "TRES_STR_CALC",\n        "JOB_CPUS_SET ",\n        "JOB_MEM_SET"\n      ],\n      "burst_buffer": "",\n      "burst_buffer_state": "",\n      "cluster": "vultr-mi325x-torch",\n      "cluster_features": "",\n      "command": "\\/t

[-]E1007 18:33:48.728597 3758455 hyperactor/src/channel/net.rs:875] error_msg:session tcp:45.63.75.210:45833.7514788451799728266: failed to deliver message within timeout
[-]E1007 18:33:51.416762 3758455 hyperactor/src/channel/net.rs:875] error_msg:session tcp:45.63.75.210:26600.962540729393715628: failed to deliver message within timeout
