In [1]:
import os
# These settings must be in the environment before `monarch` is imported
# (the monarch imports below come after this on purpose).
os.environ.update(
    {
        "MONARCH_FILE_LOG": "debug",
        "HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
        "HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
        "HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
    }
)

import socket
import subprocess
import sys
import time

from utils import get_host_ip_addr, bootstrap_addr
from monarch.actor import Actor, enable_transport, endpoint
from monarch._src.actor.bootstrap import attach_to_workers

class Hello(Actor):
    """Minimal demo actor exposing a single `hello` endpoint."""

    @endpoint
    def hello(self) -> str:
        """Print ``HELLO!`` to stdout and return the string ``"echo"``."""
        reply = "echo"
        print("HELLO!")
        return reply


# Client-side transport for this notebook process.
# NOTE(review): the `tcp://<ip>:<port>@tcp://0.0.0.0:<port>` form presumably
# means "advertise the public IP, bind on all interfaces" — confirm against
# the Monarch docs.
port = 26600
host_ip_addr = get_host_ip_addr(addr_type="public")
client_transport_addr = f"tcp://{host_ip_addr}:{port}@tcp://0.0.0.0:{port}"
enable_transport(client_transport_addr)

In [None]:
# demo that we can start a bootstrap on this host, by running it in different process.
# good for quick validation.

print(f"current PID is: {os.getpid()}")

#  worker and client can use the same port if they are on different hosts.
worker_port = 26601

python_command = f'from utils import bootstrap; bootstrap({worker_port}, "public")'
ip = get_host_ip_addr(addr_type="public")
worker_addr = bootstrap_addr(ip, worker_port)

proc = subprocess.Popen(
    [
        sys.executable,
        "-c",
        python_command,
    ],
    env={
        "MONARCH_FILE_LOG": "debug",
        "HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
        "HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
        "HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
    },
    start_new_session=True,
)

print(f"a worker host is running on pid {proc.pid}")

host_mesh = attach_to_workers(
    name="host_mesh", ca="trust_all_connections", workers=[worker_addr]
)

proc_mesh = host_mesh.spawn_procs()
await proc_mesh.logging_option(stream_to_client=True, aggregate_window_sec=3)

hello = proc_mesh.spawn("hello", Hello)
for i in range(100):
    hello.hello.call().get()

time.sleep(10)

for i in range(100):
    hello.hello.call().get()

time.sleep(10)

proc_mesh.stop().get()
host_mesh.shutdown().get()
print("done")


In [2]:
from mmt_utils import launch_mmt_job

# Launch the 4-node multi-machine job; `job` and `studio` are used by the
# following cells (status polling, machine IPs) and for cleanup.
job, studio = launch_mmt_job(
    num_nodes=4, mmt_job_name="ali_cpu_monarch | 0.2.0rc1 | 04", port=26600
)

# Plain strings: none of these messages interpolate anything.
print(
    "Job launched. You can monitor it using: job.status",
    "To stop the job: job.stop()",
    "To clean up: studio.stop()",
    sep="\n",
)

Job has not been created by the user
Launching MMT job with 4 nodes...


INFO - Multi-Machine Job was successfully launched. View it at https://lightning.ai/meta-ai/general/jobs/ali_cpu_monarch | 0.2.0rc1 | 04?app_id=mmt


Job started with ID: ali_cpu_monarch | 0.2.0rc1 | 04
Job status: Pending
Job launched. You can monitor it using: job.status
To stop the job: job.stop()
To clean up: studio.stop()


In [3]:
# Display the current job status (bare expression -> shown as cell output).
# NOTE(review): presumably the job should report Running before the next
# cells read `job.machines` and attach to the workers — confirm.
job.status

<Status.Running: 'Running'>

In [4]:
port = 26600


def _worker_channel_addr(public_ip, port):
    """Format the address string used to attach to one worker host."""
    return f"tcp://{public_ip}:{port}@tcp://0.0.0.0:{port}"


# Public IP of every machine in the launched job, in `job.machines` order.
ip_addresses_list_public = [node.public_ip for node in job.machines]
print(ip_addresses_list_public)

# One attach address per machine, same order as the IP list.
worker_addrs = [
    _worker_channel_addr(public_ip, port) for public_ip in ip_addresses_list_public
]
print(worker_addrs)


['35.224.235.249', '34.30.180.30', '34.46.93.138', '34.44.251.41']
['tcp://35.224.235.249:26600@tcp://0.0.0.0:26600', 'tcp://34.30.180.30:26600@tcp://0.0.0.0:26600', 'tcp://34.46.93.138:26600@tcp://0.0.0.0:26600', 'tcp://34.44.251.41:26600@tcp://0.0.0.0:26600']


In [5]:
host_mesh = attach_to_workers(
    name="host_mesh", ca="trust_all_connections", workers=worker_addrs
)

proc_mesh = host_mesh.spawn_procs(per_host={"gpus": 8})
await proc_mesh.logging_option(stream_to_client=True, aggregate_window_sec=3)

Monarch internal logs are being written to /tmp/alisol/monarch_log.log; execution id alisol_Dec-19_06:59_175


In [6]:
# Display the proc mesh object (bare expression -> shown as cell output).
proc_mesh

<monarch._src.actor.v1.proc_mesh.ProcMesh at 0x76a940bbb770>

In [7]:
# Spawn one `Hello` actor on every proc of the mesh.
actor_mesh = proc_mesh.spawn("hello", Hello)

# First round: result discarded; only the forwarded "HELLO!" logs matter.
actor_mesh.hello.call().get()

# NOTE(review): the 30 s pause presumably lets the aggregated log output of
# the first round arrive before the second round starts — confirm.
time.sleep(30)

# Second round: keep the result and display it as the cell output.
second_round_replies = actor_mesh.hello.call().get()
second_round_replies

[36m>>> Aggregated Logs (2025-12-19 07:03:57) >>>[0m
[33m[11 similar log lines][0m [6] HELLO!
[33m[4 similar log lines][0m [30] HELLO!
[33m[10 similar log lines][0m [28] HELLO!
[33m[7 similar log lines][0m [15] HELLO!
[36m<<< Aggregated Logs (2025-12-19 07:04:02) <<<[0m

[36m>>> Aggregated Logs (2025-12-19 07:04:02) >>>[0m
[33m[1 similar log lines][0m [0] HELLO!
[36m<<< Aggregated Logs (2025-12-19 07:04:30) <<<[0m



ValueMesh({hosts: 4, gpus: 8}):
  (({'hosts': 0/4, 'gpus': 0/8}, 'echo'),
   ({'hosts': 0/4, 'gpus': 1/8}, 'echo'),
   ({'hosts': 0/4, 'gpus': 2/8}, 'echo'),
   ({'hosts': 0/4, 'gpus': 3/8}, 'echo'),
   ({'hosts': 0/4, 'gpus': 4/8}, 'echo'),
   ({'hosts': 0/4, 'gpus': 5/8}, 'echo'),
   ({'hosts': 0/4, 'gpus': 6/8}, 'echo'),
   ({'hosts': 0/4, 'gpus': 7/8}, 'echo'),
   ({'hosts': 1/4, 'gpus': 0/8}, 'echo'),
   ({'hosts': 1/4, 'gpus': 1/8}, 'echo'),
   ({'hosts': 1/4, 'gpus': 2/8}, 'echo'),
   ({'hosts': 1/4, 'gpus': 3/8}, 'echo'),
   ({'hosts': 1/4, 'gpus': 4/8}, 'echo'),
   ({'hosts': 1/4, 'gpus': 5/8}, 'echo'),
   ({'hosts': 1/4, 'gpus': 6/8}, 'echo'),
   ({'hosts': 1/4, 'gpus': 7/8}, 'echo'),
   ({'hosts': 2/4, 'gpus': 0/8}, 'echo'),
   ({'hosts': 2/4, 'gpus': 1/8}, 'echo'),
   ({'hosts': 2/4, 'gpus': 2/8}, 'echo'),
   ({'hosts': 2/4, 'gpus': 3/8}, 'echo'),
   ({'hosts': 2/4, 'gpus': 4/8}, 'echo'),
   ({'hosts': 2/4, 'gpus': 5/8}, 'echo'),
   ({'hosts': 2/4, 'gpus': 6/8}, 'echo'),
  

[36m>>> Aggregated Logs (2025-12-19 07:04:30) >>>[0m
[33m[10 similar log lines][0m [3] HELLO!
[33m[12 similar log lines][0m [24] HELLO!
[33m[9 similar log lines][0m [12] HELLO!
[36m<<< Aggregated Logs (2025-12-19 07:04:31) <<<[0m



In [8]:
# Two bursts of 100 blocking calls, with a single 10 s pause between them
# (no pause before the first burst or after the last one).
for burst_idx in range(2):
    if burst_idx > 0:
        time.sleep(10)
    for _call in range(100):
        actor_mesh.hello.call().get()

[36m>>> Aggregated Logs (2025-12-19 07:04:31) >>>[0m
[33m[1 similar log lines][0m [3] HELLO!
[36m<<< Aggregated Logs (2025-12-19 07:05:09) <<<[0m

[36m>>> Aggregated Logs (2025-12-19 07:05:09) >>>[0m
[33m[1800 similar log lines][0m [2] HELLO!
[33m[1200 similar log lines][0m [10] HELLO!
[33m[199 similar log lines][0m [31] HELLO!
[36m<<< Aggregated Logs (2025-12-19 07:05:12) <<<[0m

[36m>>> Aggregated Logs (2025-12-19 07:05:12) >>>[0m
[33m[1 similar log lines][0m [6] HELLO!
[36m<<< Aggregated Logs (2025-12-19 07:05:20) <<<[0m

[36m>>> Aggregated Logs (2025-12-19 07:05:20) >>>[0m
[33m[702 similar log lines][0m [7] HELLO!
[33m[1200 similar log lines][0m [13] HELLO!
[33m[998 similar log lines][0m [24] HELLO!
[33m[299 similar log lines][0m [30] HELLO!
[36m<<< Aggregated Logs (2025-12-19 07:05:22) <<<[0m



In [9]:
# Stop the `Hello` actors and block until the stop request completes.
actor_mesh.stop().get()

()

In [10]:
# Shut down the host mesh and block until the request completes.
# NOTE(review): the local demo cell calls `proc_mesh.stop().get()` before
# shutting down its host mesh; here `proc_mesh` is not stopped explicitly —
# confirm that `shutdown()` also tears down the spawned procs.
host_mesh.shutdown().get()