In [1]:
# Switch the kernel's working directory to the TorchTitan checkout before anything else runs.
# NOTE(review): this is a machine-specific absolute path -- adjust it to your own checkout.
%cd /mnt/models/mreso/torchtitan/

/mnt/models/mreso/torchtitan


### 2. Define your Titan and cluster parameters

In [2]:
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

from torchtitan.train import Trainer
from torchtitan.config import ConfigManager, JobConfig
from monarch.actor import Actor, current_rank, endpoint
from torchtitan.tools.logging import init_logger, logger
import torch.distributed as dist
import torch
from dataclasses import dataclass
import os
from monarch.tools import commands
from monarch.utils import setup_env_for_distributed


@dataclass
class RunParams:
    """
        Parameters for your cluster and training job, adjust as needed.

        Every attribute carries a type annotation so that it is a real
        dataclass field: it can be overridden through the constructor
        (e.g. ``RunParams(num_nodes=2)``) and shows up in ``repr``/``==``.
        The defaults remain readable directly on the class
        (e.g. ``RunParams.num_nodes``).
    """
    # Number of training steps, forwarded as ``--training.steps``.
    training_steps: int = 50
    # Path of the TOML job config, forwarded as ``--job.config_file``.
    model_config: str = "/mnt/models/mreso/torchtitan/torchtitan/models/llama3/train_configs/debug_model.toml"
    # Dataset name, forwarded as ``--training.dataset``.
    dataset: str = "c4_test"
    # num_nodes * gpus_per_node is used as the data-parallel shard degree.
    num_nodes: int = 1
    gpus_per_node: int = 1


class TrainerActor():
    """
        A simple wrapper class which executes a TorchTitan trainer.

        NOTE(review): despite its name this class does not subclass
        ``monarch.actor.Actor``; ``start_training`` runs the trainer in the
        calling process -- confirm whether that is intended.

        Args:
            job_config: TorchTitan job configuration handed to ``Trainer``.
            rank: only used to label this trainer's log messages (default 0).
    """
    def __init__(self, job_config: JobConfig, rank: int = 0) -> None:
        self.job_config = job_config
        self.uid = f"[trainer_{rank}]"

    def start_training(self) -> None:
        """
            Build a ``Trainer`` from the stored job config and run it.

            The trainer is closed and the default process group is torn down
            whether training succeeds or fails. Any exception raised while
            constructing or running the trainer propagates to the caller.
        """
        init_logger()
        trainer: Trainer | None = None

        try:
            trainer = Trainer(self.job_config)
            logger.info(f"{self.uid} initialized successfully and starting training")
            trainer.train()
        finally:
            try:
                # trainer stays None when the Trainer constructor itself failed.
                if trainer is not None:
                    trainer.close()
            finally:
                # Only tear down a process group that actually exists; calling
                # destroy_process_group() without one raises and would mask the
                # original training error.
                if torch.distributed.is_initialized():
                    torch.distributed.destroy_process_group()
                logger.info(f"{self.uid} trainer cleaned up")

def make_job_config() -> JobConfig:
    """
        Create a job config which is digested by TorchTitan, sourced from RunParams.

        Builds a TorchTitan command-line argument list from the ``RunParams``
        defaults and parses it with ``ConfigManager``.

        Returns:
            The ``JobConfig`` produced by ``ConfigManager.parse_args``.
    """
    data_parallel_shard_degree = RunParams.num_nodes * RunParams.gpus_per_node
    output_path = "./outputs"

    # `_dh` is IPython's directory history and only exists inside an IPython
    # kernel; fall back to the current working directory when it is missing so
    # this function also works from a plain Python process.
    # NOTE(review): `_dh[0]` is presumably the directory the kernel started in,
    # not the directory selected by a later `%cd` -- confirm this is intended.
    dir_history = globals().get("_dh")
    script_dir = dir_history[0] if dir_history else os.getcwd()

    default_args = [
        "--job.config_file",
        # os.path.join returns the second argument unchanged when it is absolute.
        os.path.join(script_dir, RunParams.model_config),
        "--model.tokenizer_path",
        os.path.join(script_dir, "examples/assets/hf/Llama-3.1-8B/"),
        "--comm.trace_buf_size",
        "0",
        "--metrics.log_freq",
        "1",
        "--parallelism.data_parallel_shard_degree",
        str(data_parallel_shard_degree),
        "--activation_checkpoint.mode",
        "full",
        "--comm.train_timeout_seconds",
        "60",
        "--training.steps",
        str(RunParams.training_steps),
        "--training.dataset",
        RunParams.dataset,
        "--job.dump_folder",
        output_path,
        "--metrics.enable_tensorboard",
    ]

    config_manager = ConfigManager()
    job_config = config_manager.parse_args(default_args)

    return job_config

In [None]:
# Environment variables describing a single-process job on this host
# (one rank, world size 1); they are exported into this kernel's environment.
env = dict(
    MASTER_ADDR="localhost",
    MASTER_PORT="27000",
    RANK="0",
    LOCAL_RANK="0",
    LOCAL_WORLD_SIZE="1",
    WORLD_SIZE="1",
)

os.environ.update(env)

: 

In [None]:

# Build the trainer wrapper from the generated job config.
trainer = TrainerActor(job_config=make_job_config())

# Show which PyTorch build this kernel is using.
print(torch.__version__)

# Training is started manually; uncomment to launch it from this cell.
# trainer.start_training()

tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily.


[titan] 2025-10-02 01:09:17,333 - root - INFO - Starting job: Llama 3 debug training
[titan] 2025-10-02 01:09:17,335 - root - INFO - Building 0-D device mesh with [], []
[titan] 2025-10-02 01:09:17,359 - root - INFO - [GC] Initial GC collection took 0.00 seconds
[titan] 2025-10-02 01:09:17,728 - root - INFO - Loading tokenizer from tokenizer.json
[titan] 2025-10-02 01:09:17,873 - root - INFO - Preparing c4_test dataset from tests/assets/c4_test
[titan] 2025-10-02 01:09:17,891 - root - INFO - Building llama3 debugmodel with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=2048, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)
[titan] 2025-10-02 01:09:17,897 - root - INFO - TensorBoard logging enabled. Logs will be saved at ./outputs/tb/20251002-0109
[titan] 2025-10-02 0

: 

: 