In [1]:
import os
# These variables must be set before monarch is imported (see imports below),
# so they are applied in one shot at the very top of the notebook.
os.environ.update(
    {
        "MONARCH_FILE_LOG": "debug",
        "HYPERACTOR_MESH_ENABLE_LOG_FORWARDING": "true",
        "HYPERACTOR_MESH_ENABLE_FILE_CAPTURE": "true",
        "HYPERACTOR_MESH_TAIL_LOG_LINES": "100",
    }
)

import socket
import subprocess
import sys
import time

from utils import get_host_ip_addr, bootstrap_addr
from monarch.actor import Actor, enable_transport, endpoint
from monarch._src.actor.bootstrap import attach_to_workers


# TCP port used for the Monarch transport on this client.
port = 26600

# Public IP of this host, as resolved by the local `utils` helper.
host_ip_addr = get_host_ip_addr(addr_type="public")

# NOTE(review): the address looks like "<advertised address>@<bind address>",
# i.e. advertise the public IP but listen on all interfaces -- confirm against
# the Monarch transport documentation.
client_transport_addr = f"tcp://{host_ip_addr}:{port}@tcp://0.0.0.0:{port}"
enable_transport(client_transport_addr)

In [None]:
from mmt_utils import launch_mmt_job

# Cluster shape: number of machines in the job and GPUs per machine.
NUM_NODES = 2
NUM_GPUS = 8

# Launch the multi-machine job; its machines are used as Monarch worker hosts
# further down. `port` is the value defined in the first cell, reused here so
# the launcher and the client transport cannot drift apart.
job, studio = launch_mmt_job(
    num_nodes=NUM_NODES,
    mmt_job_name="ali_titan_monarch | 0.2.0 stable",
    port=port,
    num_gpus=NUM_GPUS,
)

# These messages have no placeholders, so they are plain strings (no f-prefix).
print("Job launched. You can monitor it using: job.status")
print("To stop the job: job.stop()")
print("To clean up: studio.stop()")

Job has not been created by the user
Launching MMT job with 2 nodes...


INFO - Multi-Machine Job was successfully launched. View it at https://lightning.ai/meta-ai/general/jobs/ali_titan_monarch | 0.2.0rc1 | 03?app_id=mmt


Job started with ID: ali_titan_monarch | 0.2.0rc1 | 03
Job status: Pending
Job launched. You can monitor it using: job.status
To stop the job: job.stop()
To clean up: studio.stop()


In [None]:
from lightning_sdk import Machine, MMT, Status, Studio


def launch_mmt_job_titan_trainer(num_nodes=2, mmt_job_name="", port=26600, num_gpus: int = 8):
    """
    Launch a multi-machine torchtitan training job using Lightning SDK's MMT API.

    If a job named ``mmt_job_name`` can be fetched and is already Running or
    Pending, that job is returned and nothing new is launched.

    Args:
        num_nodes: Number of machines the job runs on.
        mmt_job_name: Name used both to look up an existing job and to create
            a new one.
        port: Currently unused by this launcher; kept so existing callers that
            pass it keep working.
        num_gpus: GPUs per machine; selects ``Machine.L4_X_<num_gpus>``.

    Returns:
        A ``(job, studio)`` tuple.
    """

    studio = Studio()

    try:
        job = MMT(name=mmt_job_name, _fetch_job=True)

        if job.status == Status("Running") or job.status == Status("Pending"):
            print(
                f"MMT job with {num_nodes} nodes is already created! Returning the the job"
            )
            return job, studio

    except Exception:
        # Best-effort lookup: any failure to fetch the job is treated as
        # "no such job yet". `Exception` (rather than a bare `except:`) lets
        # KeyboardInterrupt / SystemExit propagate.
        print("Job has not been created by the user")

    # Install the MMT plugin before running the actual job
    studio.install_plugin("multi-machine-training")

    print(f"Launching MMT job with {num_nodes} nodes...")

    # Machine with L4 GPUs. Alternatives tried previously:
    #   getattr(Machine, f"T4_X_{num_gpus}")    # T4 GPUs
    #   getattr(Machine, f"L40S_X_{num_gpus}")  # L40S GPUs
    machine_type = getattr(Machine, f"L4_X_{num_gpus}")

    python_command = "/teamspace/studios/this_studio/torchtitan/run_train.sh"
    job = MMT.run(
        command=python_command,
        name=mmt_job_name,
        machine=machine_type,
        studio=studio,
        num_machines=num_nodes,
        env={
            "CONFIG_FILE": "/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml",
            # The original dict listed NCCL_SOCKET_IFNAME twice ("^lo,docker"
            # and then "ens5"); in a dict literal the last value wins, so
            # "ens5" is what was actually sent. Only that entry is kept.
            "NCCL_SOCKET_IFNAME": "ens5",
            # Disable InfiniBand (not available on GCP)
            "NCCL_IB_DISABLE": "1",
            # Disable P2P (can cause issues on some GCP configs)
            "NCCL_P2P_DISABLE": "1",
            # Enable debugging to see what NCCL is doing
            "NCCL_DEBUG": "INFO",
        },
    )

    print(f"Job started with ID: {job.name}")
    print(f"Job status: {job.status}")

    return job, studio

# Launch the torchtitan `run_train.sh` job with the default node/GPU counts.
# The returned (job, studio) tuple is not assigned, so the `job` / `studio`
# variables created by the earlier launch cell are left untouched.
launch_mmt_job_titan_trainer(mmt_job_name="Titan on GCP (AWS in PATH)")

In [3]:
port = 26600

# Public IP of every machine backing the MMT job.
ip_addresses_list_public = [
    machine.public_ip
    for machine in job.machines
]
print(ip_addresses_list_public)

# One transport address per worker, in the same format used for the client.
worker_addrs = [
    f"tcp://{ip}:{port}@tcp://0.0.0.0:{port}"
    for ip in ip_addresses_list_public
]
print(worker_addrs)


['34.169.67.140', '34.82.167.13']
['tcp://34.169.67.140:26600@tcp://0.0.0.0:26600', 'tcp://34.82.167.13:26600@tcp://0.0.0.0:26600']


In [4]:
host_mesh = attach_to_workers(
    name="host_mesh", ca="trust_all_connections", workers=worker_addrs
)

proc_mesh = host_mesh.spawn_procs(per_host={"gpus": NUM_GPUS})
await proc_mesh.logging_option(stream_to_client=True, aggregate_window_sec=3)

Monarch internal logs are being written to /tmp/alisol/monarch_log.log; execution id alisol_Dec-19_05:38_192


In [5]:
import getpass
def get_job_name(num_hosts: int, num_gpus_per_host: int):
    """Return ``monarch-<user>-hosts<num_hosts>-gpus<num_gpus_per_host>``.

    ``<user>`` is the login name reported by ``getpass.getuser()``.
    """
    user = getpass.getuser()
    return f"monarch-{user}-hosts{num_hosts}-gpus{num_gpus_per_host}"
# Show the job name derived from the current cluster shape.
print(get_job_name(NUM_NODES, NUM_GPUS))

monarch-alisol-hosts2-gpus8


In [6]:
import os
import sys
import logging
from monarch.actor import ProcMesh, Actor, endpoint, current_rank
import socket
from torchtitan.tools.logging import init_logger, logger
from torchtitan.train import Trainer
from typing import Optional
import torch
from torchtitan.config import JobConfig


class TitanTrainerWrapper(Actor):
    """Monarch actor that drives a torchtitan ``Trainer`` with a given job config."""

    def __init__(self, job_config: JobConfig):
        # Rank of this actor as reported by Monarch; used to tag printed output.
        self.rank = current_rank().rank
        self.job_config = job_config

    def _rprint(self, msg):
        """Helper method to print with rank information."""
        print(f"{self.rank=} {msg}")

    @endpoint
    def init(self):
        """Attach a stderr handler to the root logger and announce this actor."""
        logging.getLogger().addHandler(logging.StreamHandler(sys.stderr))
        print(f"Initializing actor: {self.rank} {current_rank()=} {socket.gethostname()=}")

    @endpoint
    def train(self):
        """Run training, or create a seed checkpoint, for this actor.

        When ``checkpoint.create_seed_checkpoint`` is set, a step-0 checkpoint
        is saved and no training is run; otherwise ``Trainer.train`` runs
        exactly once. The trainer is closed and the process group destroyed
        whether or not the run succeeds.

        Raises:
            AssertionError: if a seed checkpoint is requested while
                ``WORLD_SIZE`` is not 1 or checkpointing is not enabled.
            KeyError: if a seed checkpoint is requested and ``WORLD_SIZE`` is
                not set in the environment.
        """
        logger.info("Starting training")
        config = self.job_config
        trainer: Optional[Trainer] = None

        try:
            trainer = Trainer(config)

            # NOTE: the previous version also called trainer.train() right
            # here, unconditionally. That trained twice in the normal path and
            # trained *before* writing the "seed" checkpoint in the other one.
            if config.checkpoint.create_seed_checkpoint:
                assert (
                    int(os.environ["WORLD_SIZE"]) == 1
                ), "Must create seed checkpoint using a single device, to disable sharding."
                assert (
                    config.checkpoint.enable
                ), "Must enable checkpointing when creating a seed checkpoint."
                trainer.checkpointer.save(curr_step=0)
                logger.info("Created seed checkpoint")
            else:
                trainer.train()
        finally:
            # `trainer` stays None if the Trainer constructor itself failed.
            if trainer is not None:
                trainer.close()

            if torch.distributed.is_initialized():
                torch.distributed.destroy_process_group()
                logger.info("Process group destroyed.")
        print("Done training")

In [7]:
from torchtitan.config import ConfigManager, JobConfig
from monarch.tools.network import AddrType
from monarch.utils import setup_env_for_distributed

async def async_main(job_config: JobConfig):
    """Prepare ``proc_mesh`` for distributed training and run the trainer actors.

    Sets up the distributed environment on the notebook-level ``proc_mesh``
    with IPv4 addressing, spawns ``TitanTrainerWrapper`` actors on it with
    ``job_config``, then awaits their ``init`` and ``train`` endpoints in order.
    """
    torch.use_deterministic_algorithms(True)
    job_name = get_job_name(NUM_NODES, NUM_GPUS)

    # If use_ipaddr is not passed, the default is IPv6 for MASTER_ADDR
    # (original author's note), so IPv4 is requested explicitly.
    await setup_env_for_distributed(
        proc_mesh,
        use_ipaddr=AddrType.IPv4,
    )

    await proc_mesh.logging_option(
        stream_to_client=True,
        aggregate_window_sec=3,
    )

    print(job_config)
    print(f"Spawning meshes on {job_name}")

    trainers = proc_mesh.spawn("trainer_actor", TitanTrainerWrapper, job_config)

    # init must complete on all actors before training starts.
    await trainers.init.call()
    await trainers.train.call()

In [None]:
init_logger()
config_manager = ConfigManager()

job_name = get_job_name(NUM_NODES, NUM_GPUS)

manual_args = [
        "--job.config_file",
        os.path.expanduser("/teamspace/studios/this_studio/torchtitan/torchtitan/models/llama3/train_configs/llama3_8b.toml"),
        "--model.tokenizer-path",
        "/teamspace/studios/this_studio/torchtitan/assets/hf/Llama-3.1-8B",
        "--training.steps",
        "25",
        "--training.dataset",
        "c4_test",
        "--training.dataset_path",
        "/teamspace/studios/this_studio/torchtitan/tests/assets/c4_test",
        "--job.dump_folder",
        "/teamspace/studios/this_studio/torchtitan/outputs/" + job_name,
        "--training.seq_len",
        "1024",
    ]
config = config_manager.parse_args(manual_args)
await async_main(config)

In [None]:
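# NOTE: `trainer_actor` is a local variable inside async_main and is not returned,
# so the line below would raise NameError if uncommented as-is.
# trainer_actor.stop().get()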

In [None]:
# Tear down the host mesh once the run is finished. `.get()` is called on the
# value returned by shutdown() -- presumably to block until teardown has
# completed (confirm against the Monarch API).
host_mesh.shutdown().get()