In [1]:
# NOTE(review): hard-coded absolute path to the examples directory; presumably
# needed so the `slurm.utils` import in the next cell resolves — confirm and
# adjust for your own checkout.
%cd /mnt/models/mreso/monarch/examples/

/mnt/models/mreso/monarch/examples


In [None]:


# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

# @noautodeps
# pyre-ignore-all-errors
import logging
import os
import os
import torch
import torch.distributed as dist
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim

from monarch.tools import commands
from monarch.actor import Actor, current_rank, endpoint
from monarch.actor import Actor, current_rank, endpoint
from monarch.utils import setup_env_for_distributed
from torch.nn.parallel import DistributedDataParallel as DDP
from slurm.utils import get_appdef, get_server_info, create_proc_mesh

# Rust diagnostics knobs: full backtraces and debug-level logs
# (presumably consumed by Monarch's Rust components — confirm).
os.environ.update({"RUST_BACKTRACE": "full", "RUST_LOG": "debug"})


# Configure the root logger at DEBUG. `force=True` removes any handlers that
# are already attached to the root logger before installing this configuration.
logging.basicConfig(
    force=True,
    level=logging.DEBUG,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(name)s %(asctime)s %(levelname)s %(message)s",
)


# Logger for this cell/module; used by the training loop below.
logger: logging.Logger = logging.getLogger(__name__)


class ToyModel(nn.Module):
    """A simple two-layer toy model (Linear -> ReLU -> Linear) for demonstration.

    The defaults reproduce the original fixed architecture (10 -> 10,000,000 -> 5).
    The sizes are exposed as parameters so the model can also be built at a
    small scale, e.g. for quick local checks.

    Args:
        in_features: Size of each input sample.
        hidden_features: Width of the hidden layer.
        out_features: Size of each output sample.
    """

    def __init__(
        self,
        in_features: int = 10,
        hidden_features: int = 10_000_000,
        out_features: int = 5,
    ):
        super().__init__()
        self.net1 = nn.Linear(in_features, hidden_features)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Apply net1, ReLU, then net2 to ``x`` and return the result."""
        return self.net2(self.relu(self.net1(x)))


class DDPActor(Actor):
    """This Actor wraps the basic functionality from Torch's DDP example.

    Conveniently, all of the methods we need are already laid out for us,
    so we can just wrap them in the usual Actor endpoint semantics with some
    light modifications.

    Adapted from: https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case
    """

    def __init__(self):
        # Rank of this actor as reported by monarch's `current_rank()`; it is
        # used both for log prefixes and as the torch.distributed rank.
        self.rank = current_rank().rank

    def _rprint(self, msg):
        """Helper method to print ``msg`` prefixed with this actor's rank."""
        print(f"{self.rank=} {msg}")

    @endpoint
    async def setup(self):
        """Initialize the PyTorch distributed process group.

        Reads ``WORLD_SIZE`` from the environment and initializes an NCCL
        process group using this actor's rank.

        Raises:
            KeyError: If ``WORLD_SIZE`` is not set in the environment.
        """
        self._rprint("Initializing torch distributed")

        world_size = int(os.environ["WORLD_SIZE"])
        # initialize the process group
        dist.init_process_group("nccl", rank=self.rank, world_size=world_size)
        self._rprint("Finished initializing torch distributed")

    @endpoint
    async def cleanup(self):
        """Clean up the PyTorch distributed process group."""
        self._rprint("Cleaning up torch distributed")
        dist.destroy_process_group()

    @endpoint
    async def demo_basic(self):
        """Run a basic DDP training example.

        Builds a ``ToyModel`` on the GPU given by ``LOCAL_RANK``, wraps it in
        DDP, and runs 50 SGD steps on random inputs and labels with MSE loss.

        Raises:
            KeyError: If ``LOCAL_RANK`` is not set in the environment.
        """
        self._rprint("Running basic DDP example")

        # create model and move it to GPU with id rank
        local_rank = int(os.environ["LOCAL_RANK"])
        self._rprint(f"{local_rank=}")
        model = ToyModel().to(local_rank)
        ddp_model = DDP(model, device_ids=[local_rank])

        loss_fn = nn.MSELoss()
        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
        # `step` rather than `iter` so the builtin is not shadowed; the emitted
        # log text ("iter=<n>") is unchanged.
        for step in range(50):
            logger.info("iter=%d", step)

            optimizer.zero_grad()
            # Inputs are created on CPU, as in the upstream tutorial.
            # NOTE(review): this relies on DDP moving inputs to the device
            # listed in `device_ids` — confirm for your torch version.
            outputs = ddp_model(torch.randn(20, 10))
            labels = torch.randn(20, 5).to(local_rank)
            loss_fn(outputs, labels).backward()
            optimizer.step()

        self._rprint("Finished running basic DDP example")


async def main(num_hosts: int = 2):
    """Run the DDP example end to end on a Slurm-backed proc mesh.

    Obtains an appdef and server info via the `slurm.utils` helpers, creates a
    proc mesh, spawns a ``DDPActor`` on it, and then calls its ``setup``,
    ``demo_basic`` and ``cleanup`` endpoints in order. The server named by
    ``server_info.name`` is killed in a ``finally`` block, so that happens
    whether or not the steps inside the ``try`` succeed.

    Args:
        num_hosts: Number of hosts passed to ``get_appdef`` and
            ``create_proc_mesh``. Defaults to 2, the value that was
            previously hard-coded.
    """
    appdef = await get_appdef(num_hosts)
    server_info = await get_server_info(appdef)

    try:
        print("CREATE PROC MESH")
        proc_mesh = await create_proc_mesh(num_hosts, appdef, server_info)

        # Ask the mesh to stream its logs back to this client.
        await proc_mesh.logging_option(
            stream_to_client=True,
        )

        print("SPAWN ACTORS")
        ddp_actor = proc_mesh.spawn("ddp_actor", DDPActor)
        print("SETUP ENV")
        # Must run before `setup`, which reads WORLD_SIZE from the environment
        # (presumably populated by this helper — confirm).
        await setup_env_for_distributed(proc_mesh)
        print("SETUP CALL")
        await ddp_actor.setup.call()
        print("BASIC DEMO CALL")
        await ddp_actor.demo_basic.call()
        print("CLEAUP CALL")
        await ddp_actor.cleanup.call()

        print("DDP example completed successfully!")

    finally:
        # Always tear down the Slurm server so the allocation is not left behind.
        commands.kill(f"slurm:///{server_info.name}")


# Entry point. Top-level `await` only works where the interpreter supports it,
# such as an IPython/Jupyter cell; a plain Python script would need something
# like `asyncio.run(main())` instead.
if __name__ == "__main__":
    await main()

torchx.schedulers.slurm_scheduler 2025-10-03 00:23:14 INFO unable to get job info for `monarch-root` with `squeue` (squeue: error: Invalid job id: monarch-root
), trying `sacct`
torchx.schedulers.slurm_scheduler 2025-10-03 00:23:14 INFO unable to get job info for `monarch-root` with `sacct` (sacct: fatal: Bad job/step specified: monarch-root
)
monarch.tools.commands 2025-10-03 00:23:14 INFO no existing RUNNING server `slurm:///monarch-root` creating new one...
torchx.runner.api 2025-10-03 00:23:14 INFO Tracker configurations: {}
torchx.runner.api 2025-10-03 00:23:14 INFO Checking for changes in workspace `/root/.monarch/out/tmpp678t76b/workspace`...
torchx.runner.api 2025-10-03 00:23:14 INFO To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.
torchx.runner.api 2025-10-03 00:23:14 INFO Reusing original image `monarch_default_workspace:latest` for role[0]=mesh0. Either a patch was built or no changes to workspace was detected.
monarch.tools.commands 20

app_id='monarch-root'
slurm_24_11_0=(24, 11)
slurm_version=(21, 8)
slurm_24_11_0=(24, 11)
slurm_version=(21, 8)
script='#!/bin/bash\n#\n# Generated by TorchX 0.8.0dev0\n# Run with: sbatch --parsable /tmp/tmpxjb7eqye/torchx-sbatch.sh\n#\n#SBATCH --job-name=mesh0-0 --requeue --ntasks-per-node=1 --cpus-per-task=64 --mem=2062607 --gpus-per-node=8\n#SBATCH hetjob\n#SBATCH --job-name=mesh0-1 --requeue --ntasks-per-node=1 --cpus-per-task=64 --mem=2062607 --gpus-per-node=8\n\nset -evx\n\nexport PYTHONUNBUFFERED=1\nexport SLURM_UNBUFFEREDIO=1\nexport TORCHX_MAX_RETRIES=0\n\nset +e\nsrun --output=slurm-"$SLURM_JOB_ID"-mesh0-0.out --error=slurm-"$SLURM_JOB_ID"-mesh0-0.err --wait=60 --kill-on-bad-exit=1 --export=ALL,WORKSPACE_DIR=monarch_default_workspace:latest/workspace,PYTHONPATH=monarch_default_workspace:latest/workspace/,CONDA_DIR=monarch_default_workspace:latest/conda,TORCHX_JOB_ID=slurm:///"$SLURM_JOB_ID",TORCHX_INTERNAL_SESSION_ID=ef0029ee-bb7c-4980-be0d-2d9380b052fa process_allocator --po

slurm.utils 2025-10-03 00:23:19 INFO 
===== Server Info =====
{
  "name": "1385",
  "server_handle": "slurm:///1385",
  "state": "RUNNING",
  "meshes": {
    "mesh0": {
      "host_type": "__UNSET__",
      "hosts": 6,
      "gpus": -1,
      "hostnames": [
        "chi2599",
        "chi2600"
      ]
    }
  }
}
asyncio 2025-10-03 00:23:19 DEBUG Using selector: EpollSelector
monarch._src.actor.allocator 2025-10-03 00:23:19 INFO no match label `procmesh.monarch.meta.com/name` specified in alloc constraints
monarch._src.actor.allocator 2025-10-03 00:23:19 INFO found a single proc mesh `mesh0` in slurm:///1385, will allocate on it
monarch.tools.network 2025-10-03 00:23:19 INFO no AF_INET6 address that can bind TCP sockets for `chi2599:26600` (error: [Errno -3] Temporary failure in name resolution)
monarch.tools.network 2025-10-03 00:23:19 INFO resolved AF_INET address `108.61.203.106:26600` for `chi2599:26600`
monarch.tools.network 2025-10-03 00:23:19 INFO no AF_INET6 address that can bi

app_id='1385'
output='{\n  "meta": {\n    "plugin": {\n      "type": "openapi\\/v0.0.37",\n      "name": "Slurm OpenAPI v0.0.37"\n    },\n    "Slurm": {\n      "version": {\n        "major": 21,\n        "micro": 5,\n        "minor": 8\n      },\n      "release": "21.08.5"\n    }\n  },\n  "errors": [\n  ],\n  "jobs": [\n    {\n      "account": "",\n      "accrue_time": 0,\n      "admin_comment": "",\n      "array_job_id": 0,\n      "array_task_id": null,\n      "array_max_tasks": 0,\n      "array_task_string": "",\n      "association_id": 0,\n      "batch_features": "",\n      "batch_flag": false,\n      "batch_host": "chi2611",\n      "flags": [\n        "JOB_CPUS_SET "\n      ],\n      "burst_buffer": "",\n      "burst_buffer_state": "",\n      "cluster": "vultr-mi325x-torch",\n      "cluster_features": "",\n      "command": "\\/bin\\/bash",\n      "comment": "",\n      "contiguous": false,\n      "core_spec": null,\n      "thread_spec": null,\n      "cores_per_socket": null,\n      

[DEBUG] start_comm_watcher: ordered_hosts.len() = 2
[DEBUG] start_comm_watcher: ordered_hosts = [RemoteProcessAllocHost { id: "108.61.203.106", hostname: "108.61.203.106" }, RemoteProcessAllocHost { id: "45.63.68.72", hostname: "45.63.68.72" }]
[DEBUG] start_comm_watcher: host_states.len() = 2
[DEBUG] start_comm_watcher: host_states keys = ["45.63.68.72", "108.61.203.106"]
[DEBUG] start_comm_watcher: looking up host.id = "108.61.203.106"
[DEBUG] start_comm_watcher: looking up host.id = "45.63.68.72"


SPAWN ACTORS
SETUP ENV
SETUP CALL
BASIC DEMO CALL
CLEAUP CALL
DDP example completed successfully!
app_id='1385'
output='{\n  "meta": {\n    "plugin": {\n      "type": "openapi\\/v0.0.37",\n      "name": "Slurm OpenAPI v0.0.37"\n    },\n    "Slurm": {\n      "version": {\n        "major": 21,\n        "micro": 5,\n        "minor": 8\n      },\n      "release": "21.08.5"\n    }\n  },\n  "errors": [\n  ],\n  "jobs": [\n    {\n      "account": "",\n      "accrue_time": 1759450195,\n      "admin_comment": "",\n      "array_job_id": 0,\n      "array_task_id": null,\n      "array_max_tasks": 0,\n      "array_task_string": "",\n      "association_id": 0,\n      "batch_features": "",\n      "batch_flag": true,\n      "batch_host": "chi2599",\n      "flags": [\n        "TRES_STR_CALC",\n        "JOB_CPUS_SET "\n      ],\n      "burst_buffer": "",\n      "burst_buffer_state": "",\n      "cluster": "vultr-mi325x-torch",\n      "cluster_features": "",\n      "command": "\\/root\\/nfs_models\\/john\