## Monarch + TorchTitan on SLURM
This example notebook demonstrates how you can easily run and iterate on a distributed training job with Monarch and TorchTitan.

#### Prerequisites
Please make sure your environment is set up for this notebook:
1. Install Monarch nightly: https://github.com/meta-pytorch/monarch/blob/main/scripts/install_nightly.py
2. Install Titan nightly: https://github.com/pytorch/torchtitan?tab=readme-ov-file#nightly-builds
3. Ensure you have a valid Titan model config at the path given by `RunParams.model_config`, which is resolved relative to the notebook directory (e.g., https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/train_configs/debug_model.toml)

### 1. Create your SLURM job
Configure parameters for your cluster:
- num_nodes: Number of nodes to allocate (default: 2)
- gpus_per_node: Number of GPUs per node (default: 8)
- mesh_name: Name for the mesh (default: "mesh0")
- time_limit: Maximum job duration (default: "06:00:00")

In [None]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

import logging
import os

from monarch.job import SlurmJob

# Configure logging. force=True removes any handlers already attached to the
# root logger, so this configuration takes effect even if logging was set up
# earlier in the kernel.
logging.basicConfig(
    level=logging.INFO,
    format="%(name)s %(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    force=True,
)
logger: logging.Logger = logging.getLogger(__name__)

# Configure job parameters
num_nodes = 2  # adjust for your system
gpus_per_node = 8  # adjust for your hardware
mesh_name = "mesh0"
time_limit = "06:00:00"  # maximum job duration

# Remember the directory this cell was run from. It is used later to build
# absolute paths, and it is also exported through the environment.
# NOTE(review): presumably ./custom_bootstrap_exec.sh reads this variable on
# the workers - confirm against that script.
MONARCH_EXAMPLE_FOLDER = os.getcwd()
os.environ["MONARCH_EXAMPLE_FOLDER"] = MONARCH_EXAMPLE_FOLDER

# Create a SLURM job with a single mesh of `num_nodes` nodes
slurm_job = SlurmJob(
    meshes={mesh_name: num_nodes},
    job_name="monarch_example",
    gpus_per_node=gpus_per_node,
    time_limit=time_limit,
    python_exe="./custom_bootstrap_exec.sh",
)

### 2. Define your Titan and cluster parameters

In [3]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

from torchtitan.train import Trainer
from torchtitan.config import ConfigManager, JobConfig
from monarch.actor import Actor, current_rank, endpoint
from torchtitan.tools.logging import init_logger, logger
import torch
from dataclasses import dataclass
import os
from monarch.utils import setup_env_for_distributed


@dataclass
class RunParams:
    """
    Parameters for your cluster and training job, adjust as needed.

    Every attribute carries a type annotation so that it is a real dataclass
    field: it shows up in the generated ``__init__``/``__repr__`` and can be
    overridden per instance. The defaults remain readable as class attributes
    (e.g. ``RunParams.num_nodes``).
    """
    # Number of training steps (forwarded as --training.steps)
    training_steps: int = 50
    # TorchTitan model config, given relative to the notebook directory
    model_config: str = "../../torchtitan/torchtitan/models/llama3/train_configs/debug_model.toml"
    # Dataset name (forwarded as --training.dataset)
    dataset: str = "c4"
    # Cluster shape; the defaults are taken from the notebook-level variables
    # of the same name, which must already be defined when this class is created.
    num_nodes: int = num_nodes
    gpus_per_node: int = gpus_per_node


class TrainerActor(Actor):
    """
    A simple wrapper class which executes a TorchTitan trainer in a Monarch actor.

    Each actor instance keeps the job config it was spawned with and a short
    identifier derived from its rank, which is used to prefix its log messages.
    """
    def __init__(self, job_config: JobConfig) -> None:
        """Store the TorchTitan job config and build this actor's log prefix."""
        self.job_config = job_config
        rank = current_rank().rank
        self.uid = f"[trainer_{rank}]"

    @endpoint
    async def start_training(self) -> None:
        """
        Build a TorchTitan ``Trainer`` from the stored config and run it.

        The trainer is closed whether training succeeds or fails, and any
        exception raised during construction or training is re-raised to the
        caller after cleanup.
        """
        init_logger()
        trainer: Trainer | None = None

        try:
            trainer = Trainer(self.job_config)
            logger.info(f"{self.uid} initialized successfully and starting training")
            trainer.train()
        except Exception:
            # Trainer construction itself may have failed, in which case there
            # is nothing to close.
            if trainer:
                trainer.close()
            raise
        else:
            trainer.close()
        finally:
            # Only tear down the process group if one was actually created.
            # Calling destroy_process_group() without an initialized default
            # group raises, which would hide the original failure when the
            # Trainer could not be constructed.
            if torch.distributed.is_initialized():
                torch.distributed.destroy_process_group()
            logger.info(f"{self.uid} trainer cleaned up")

def make_job_config() -> JobConfig:
    """
    Create a job config which is digested by TorchTitan, sourced from RunParams.

    Builds a TorchTitan command-line argument list from the ``RunParams``
    defaults and parses it with ``ConfigManager``.

    Returns:
        The ``JobConfig`` produced by ``ConfigManager.parse_args``.
    """
    # Shard across every GPU in the allocation
    data_parallel_shard_degree = RunParams.num_nodes * RunParams.gpus_per_node
    output_path = "./outputs"

    # Anchor both the model config and the tokenizer assets on the same base
    # directory. MONARCH_EXAMPLE_FOLDER is an ordinary notebook variable, unlike
    # IPython's `_dh` directory history, which only exists inside an IPython
    # session.
    base_dir = MONARCH_EXAMPLE_FOLDER
    default_args = [
        "--job.config_file",
        os.path.join(base_dir, RunParams.model_config),
        # NOTE(review): the original cell kept "--model.tokenizer_path"
        # commented out here; presumably a flag name used by other TorchTitan
        # builds - confirm.
        "--model.hf_assets_path",
        f"{MONARCH_EXAMPLE_FOLDER}/../../torchtitan/tests/assets/tokenizer/",
        "--comm.trace_buf_size",
        "0",
        "--metrics.log_freq",
        "1",
        "--parallelism.data_parallel_shard_degree",
        str(data_parallel_shard_degree),
        "--activation_checkpoint.mode",
        "full",
        "--comm.train_timeout_seconds",
        "60",
        "--training.steps",
        str(RunParams.training_steps),
        "--training.dataset",
        RunParams.dataset,
        "--job.dump_folder",
        output_path,
        "--metrics.enable_tensorboard",
    ]

    config_manager = ConfigManager()
    job_config = config_manager.parse_args(default_args)

    return job_config

  from .autonotebook import tqdm as notebook_tqdm


### 3. Execute your training job
You can make adjustments and run this on the existing SLURM allocations as many times as you would like!

In [4]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

async def main():
    """
    Run one TorchTitan training job on the existing SLURM allocation.

    Spawns one process per GPU on the job's mesh, prepares the environment for
    torch.distributed, starts a ``TrainerActor`` in every process and waits for
    training to finish.

    Failures inside the workflow are logged with their traceback and are not
    re-raised, so the cell completes either way; check the log output to see
    whether training succeeded.
    """
    job_config = make_job_config()

    try:
        # 1. Get job state and create process mesh.
        # The mesh is looked up by the configured `mesh_name` so that renaming
        # the mesh in the job-creation cell does not break this cell.
        job_state = slurm_job.state()
        host_mesh = getattr(job_state, mesh_name)
        proc_mesh = host_mesh.spawn_procs({"gpus": RunParams.gpus_per_node})

        # 2. Configure remote logging behavior
        await proc_mesh.logging_option(
            stream_to_client=True,
            # aggregate_window_sec=None  # Uncomment to disable log batching
        )

        # 3. Setup environment for torch.distributed
        await setup_env_for_distributed(proc_mesh)

        # 4. Spawn TrainerActor on each GPU
        trainer = proc_mesh.spawn("trainer_actor", TrainerActor, job_config)

        # 5. Execute the training job
        await trainer.start_training.call()

        logger.info("Training completed successfully!")

    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback, which
        # a plain logger.error call would drop.
        logger.exception(f"Training workflow failed: {e}")

# Notebook entry point. Top-level `await` is accepted here because the cell is
# executed by IPython/Jupyter; a plain Python script would need
# `asyncio.run(main())` instead.
if __name__ == "__main__":
    await main()

Found cached job at path: .monarch/job_state.pkl
SLURM job 864 not found in queue
Cached job cannot run this spec, removing cache
Cancelled SLURM job 864
Applying current job
Submitting SLURM job with 1 nodes
SLURM job 865 submitted. Logs will be written to: /home/chcai/monarch/examples/slurm_865_monarch_example_845268.out
Saving job to cache at .monarch/job_state.pkl
Job has started, connecting to current state


SLURM job 865 is running on 1 nodes: ['chi-mi325x-pod2-103']
Monarch internal logs are being written to /tmp/chcai/monarch_log.log


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1] [titan] 2025-11-20 22:01:12,892 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 1/8}>] Starting job: Llama 3 debug training
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1] [titan] 2025-11-20 22:01:12,894 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 1/8}>] Building 1-D device mesh with ['dp_shard'], [8]
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1] [titan] 2025-11-20 22:01:12,916 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 1/8}>] [GC] Initial GC collection took 0.00 seconds
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 2079] [7] [titan] 2025-11-20 22:01:12,892 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 7/8}>] Starting job: Llama 3 debug training
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6] [titan] 2025-11-20 22:01:12,892 - root 

[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 789] [2]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 2079] [7]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1305] [4]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] [titan] 2025-11-20 22:01:13,486 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 3/8}>] Building 1-D device mesh with ['dp_shard'], [8]
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 273] [0] [titan] 2025-11-20 22:01:13,488 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 0/8}>] Building 1-D device mesh with ['dp_shard'], [8]
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] [titan] 2025-11-20 22:01:13,513 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 3/8}>] [GC] Initial GC collection took 0.00 seconds
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 273] [0] [titan] 2025-11-20 22:01:13,513 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 0/8}>] [GC] Initial GC collection took 0.00 seconds


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 273] [0]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] chi-mi325x-pod2-103:1047:2346 [3] NCCL INFO ROCr version 1.18
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] chi-mi325x-pod2-103:1047:2346 [3] NCCL INFO Dmabuf feature disabled without NCCL_DMABUF_ENABLE=1
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] chi-mi325x-pod2-103:1047:2346 [3] NCCL INFO Kernel version: 5.15.0-160-generic
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] chi-mi325x-pod2-103:1047:2346 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to enp49s0f0np0
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] chi-mi325x-pod2-103:1047:2346 [3] NCCL INFO Bootstrap: Using enp49s0f0np0:45.76.227.146<0>
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] chi-mi325x-pod2-103:1047:2346 [3] NCCL INFO RCCL version : 2.26.6-HEAD:64f48b6
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] HIP version  : 7.0.51831-7c9236b16
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] ROC



[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] chi-mi325x-pod2-103:1047:2912 [3[titan] 2025-11-20 22:01:23,374 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 3/8}>] Loading tokenizer from tokenizer.json
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 789] [2] chi-mi325x-pod2-103:7[titan] 2025-11-20 22:01:23,374 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 2/8}>] Loading tokenizer from tokenizer.json
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6] chi-mi325[titan] 2025-11-20 22:01:23,374 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 6/8}>] Loading tokenizer from tokenizer.json
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 2079] [7] chi-mi325[titan] 2025-11-20 22:01:23,374 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 7/8}>] Loading tokenizer from tokenizer.json
[chi-mi325x-pod2-103.ord.vultr.cpe.ice

[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 789] [2]   return _bootstrap._gcd_import(name[level:], package, level)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1305] [4]   return _bootstrap._gcd_import(name[level:], package, level)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1305] [4] [titan] 2025-11-20 22:01:26,406 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 4/8}>] Building llama3 debugmodel with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=2048, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, rope_scaling_args=RoPEScalingArgs(scaling_factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_position_embeddings=8192), max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 789] [2] [titan] 2025-11-20 22:01:26,408 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 2/8}>] Building llama3 debugmodel with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_laye

[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1305] [4]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 789] [2] [titan] 2025-11-20 22:01:26,463 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 2/8}>] [34mModel llama3 debugmodel [31msize: 6,163,712 total parameters[39m
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 789] [2] [titan] 2025-11-20 22:01:26,463 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 2/8}>] Applied full activation checkpointing to the model


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 789] [2]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1305] [4] [titan] 2025-11-20 22:01:26,471 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 4/8}>] Applied FSDP to the model
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 789] [2] [titan] 2025-11-20 22:01:26,474 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 2/8}>] Applied FSDP to the model


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5]   return _bootstrap._gcd_import(name[level:], package, level)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5] [titan] 2025-11-20 22:01:26,747 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 5/8}>] Building llama3 debugmodel with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=2048, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, rope_scaling_args=RoPEScalingArgs(scaling_factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_position_embeddings=8192), max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5] [titan] 2025-11-20 22:01:26,754 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 5/8}>] CUDA capacity: AMD Instinct MI325X with 255.98GiB memory
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5] [titan] 2025-11-20 22:

[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5] [titan] 2025-11-20 22:01:26,801 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 5/8}>] Applied FSDP to the model


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 273] [0]   return _bootstrap._gcd_import(name[level:], package, level)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 273] [0] [titan] 2025-11-20 22:01:26,873 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 0/8}>] Building llama3 debugmodel with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=2048, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, rope_scaling_args=RoPEScalingArgs(scaling_factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_position_embeddings=8192), max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 273] [0] [titan] 2025-11-20 22:01:26,879 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 0/8}>] TensorBoard logging enabled. Logs will be saved at ./outputs/tb/20251120-2201
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 273] [0] [tit

[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 273] [0]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 273] [0] [titan] 2025-11-20 22:01:26,930 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 0/8}>] Applied FSDP to the model


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6]   return _bootstrap._gcd_import(name[level:], package, level)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6] [titan] 2025-11-20 22:01:27,307 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 6/8}>] Building llama3 debugmodel with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=2048, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, rope_scaling_args=RoPEScalingArgs(scaling_factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_position_embeddings=8192), max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6] [titan] 2025-11-20 22:01:27,312 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 6/8}>] CUDA capacity: AMD Instinct MI325X with 255.98GiB memory
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6] [titan] 2025-11-20 22:

[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6] [titan] 2025-11-20 22:01:27,362 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 6/8}>] Applied FSDP to the model
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1305] [4] [titan] 2025-11-20 22:01:27,366 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 4/8}>] Peak FLOPS used for computing MFU: 1.300e+15
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1305] [4] [titan] 2025-11-20 22:01:27,366 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 4/8}>] CUDA memory usage for model: 0.00GiB(0.00%)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1305] [4] [titan] 2025-11-20 22:01:27,368 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 4/8}>] Mixed precision training is handled by fully_shard
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1305] [4] [titan] 2025-11-20 22:01:27,368 - roo

[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1]   return _bootstrap._gcd_import(name[level:], package, level)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5] [titan] 2025-11-20 22:01:27,423 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 5/8}>] Mixed precision training is handled by fully_shard
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 789] [2] [titan] 2025-11-20 22:01:27,424 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 2/8}>] Peak FLOPS used for computing MFU: 1.300e+15
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 789] [2] [titan] 2025-11-20 22:01:27,424 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 2/8}>] CUDA memory usage for model: 0.00GiB(0.00%)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5] [titan] 2025-11-20 22:01:27,424 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 5/8}>] Trainer is initialized with local batch size 8, global batch size 64, gradient accumulation steps 1, sequence length 2048,

[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1] [titan] 2025-11-20 22:01:27,534 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 1/8}>] Applied FSDP to the model


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3]   return _bootstrap._gcd_import(name[level:], package, level)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6] [titan] 2025-11-20 22:01:27,669 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 6/8}>] Peak FLOPS used for computing MFU: 1.300e+15
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6] [titan] 2025-11-20 22:01:27,670 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 6/8}>] CUDA memory usage for model: 0.00GiB(0.00%)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6] [titan] 2025-11-20 22:01:27,672 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 6/8}>] Mixed precision training is handled by fully_shard
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6] [titan] 2025-11-20 22:01:27,672 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 6/8}>] Trainer is initialized with local batch size 8, global batch size 64, gradient accumulation steps 1, sequence length 204

[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] [titan] 2025-11-20 22:01:27,748 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 3/8}>] Applied FSDP to the model


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 2079] [7]   return _bootstrap._gcd_import(name[level:], package, level)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1] [titan] 2025-11-20 22:01:27,839 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 1/8}>] Peak FLOPS used for computing MFU: 1.300e+15
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1] [titan] 2025-11-20 22:01:27,840 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 1/8}>] CUDA memory usage for model: 0.00GiB(0.00%)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1] [titan] 2025-11-20 22:01:27,843 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 1/8}>] Mixed precision training is handled by fully_shard
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1] [titan] 2025-11-20 22:01:27,843 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 1/8}>] Trainer is initialized with local batch size 8, global batch size 64, gradient accumulation steps 1, sequence length 2048, t

[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 2079] [7]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 2079] [7] [titan] 2025-11-20 22:01:27,911 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 7/8}>] Applied FSDP to the model
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] [titan] 2025-11-20 22:01:28,061 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 3/8}>] Peak FLOPS used for computing MFU: 1.300e+15
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] [titan] 2025-11-20 22:01:28,061 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 3/8}>] CUDA memory usage for model: 0.00GiB(0.00%)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] [titan] 2025-11-20 22:01:28,064 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 3/8}>] Mixed precision training is handled by fully_shard
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3] [titan] 2025-11-20 22:01:28,064 - roo

[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 789] [2]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 273] [0]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1821] [6]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 531] [1]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1047] [3]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1305] [4]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 2079] [7]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5]   sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)


[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 2079] [7] [titan] 2025-11-20 22:01:35,990 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 7/8}>] [31mstep:  1  [32mloss:  8.1300  [38;2;180;60;0mgrad_norm:  1.4516  [38;2;54;234;195mmemory:  0.76GiB(0.30%)  [34mtps: 2,024  [36mtflops: 0.14  [35mmfu: 0.01%[39m
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5] [titan] 2025-11-20 22:01:35,990 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 5/8}>] [31mstep:  1  [32mloss:  8.1300  [38;2;180;60;0mgrad_norm:  1.4516  [38;2;54;234;195mmemory:  0.76GiB(0.30%)  [34mtps: 1,781  [36mtflops: 0.13  [35mmfu: 0.01%[39m
[chi-mi325x-pod2-103.ord.vultr.cpe.ice.amd.com 1563] [5] [titan] 2025-11-20 22:01:35,990 - root - INFO - [actor=<root>.<__main__.TrainerActor trainer_actor{'hosts': 0/1, 'gpus': 5/8}>] Synchronizing and adjusting timeout for all ProcessGroups to 0:01:00
[chi-mi325x-pod2-103.ord.vultr.cpe

root 2025-11-20 22:01:40 INFO [actor=<root>] Training completed successfully!


### 4. Cleanup the SLURM job
Once you're done experimenting, free up the allocation

In [5]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

# Cancel the SLURM job, releasing all reserved nodes back to the cluster
slurm_job.kill()
# NOTE: this message is logged as soon as kill() returns without raising; it
# does not itself check that the job was actually cancelled.
logger.info("Job terminated successfully")

Cancelled SLURM job 865
root 2025-11-20 22:01:42 INFO [actor=<root>] Job terminated successfully
