Megablocks-based MoE #1197

Closed
2 changes: 1 addition & 1 deletion configs/125M-moe.yml → configs/125M-moe-deepspeed.yml
@@ -11,7 +11,7 @@
# across the node boundaries )
"pipe_parallel_size": 1,
"model_parallel_size": 1,
"moe_expert_parallel_size": 1,
"moe_deepspeed_expert_parallel_size": 1,

# model settings
"num_layers": 12,
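For anyone carrying an existing DeepSpeed-MoE config, a minimal sketch of the rename above (both key names come from this hunk; the value is just the one shown in the diff):

```yaml
# before (old key name)
"moe_expert_parallel_size": 1,
# after (new key name used by the legacy DeepSpeed MoE path)
"moe_deepspeed_expert_parallel_size": 1,
```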
83 changes: 83 additions & 0 deletions configs/bf16_125M_moe.yml
@@ -0,0 +1,83 @@
# GPT-2 pretraining setup
{
  # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
  # across the node boundaries )
  "pipe_parallel_size": 2,
  "model_parallel_size": 2,

  # model settings
  "num_layers": 12,
  "hidden_size": 1024,
  "num_attention_heads": 16,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
  "norm": "layernorm",
  "pos_emb": "rotary",
  "no_weight_tying": true,

  # moe settings
  "moe_num_experts": 8,

  # these should provide some speedup but takes a while to build, set to true if desired
  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0006,
      "betas": [0.9, 0.999],
      "eps": 1.0e-8,
    }
  },
  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
  "zero_optimization": {
    "stage": 0,
    "allgather_partitions": True,
    "allgather_bucket_size": 500000000,
    "overlap_comm": True,
    "reduce_scatter": True,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": True,
  },

  # batch / data settings
  "train_micro_batch_size_per_gpu": 4,
  "data_impl": "mmap",
  "split": "949,50,1",

  # activation checkpointing
  "checkpoint_activations": true,
  "checkpoint_num_layers": 1,
  "partition_activations": true,
  "synchronize_each_layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight_decay": 0.0,
  "hidden_dropout": 0.0,
  "attention_dropout": 0.0,

  "precision": "bfloat16",

  "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32

  # misc. training settings
  "train_iters": 5,
  "lr_decay_iters": 320000,
  "distributed_backend": "nccl",
  "min_lr": 0.0006,
  "warmup": 0.0,
  "checkpoint_factor": 10000,
  "eval_interval": 1000,
  "eval_iters": 10,

  # logging
  "log_interval": 1,
  "steps_per_print": 1,
  "keep_last_n_checkpoints": 4,
  "wall_clock_breakdown": true,
}
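The config above exercises the new Megablocks-backed dMoE path: it sets only `moe_num_experts`, and `use_deepspeed_moe` defaults to False per the argument docs added below. As a hedged sketch, switching the same model to the legacy DeepSpeed token-dropping MoE would add roughly the fragment below; the option names come from the `NeoXArgsMoE` documentation in this PR, while the values are illustrative placeholders:

```yaml
  # illustrative fragment only -- option names from the NeoXArgsMoE docs below,
  # values are placeholder assumptions
  "moe_num_experts": 8,
  "use_deepspeed_moe": true,                   # opt into the legacy DeepSpeed MoE path
  "moe_deepspeed_expert_parallel_size": 1,
  "moe_deepspeed_train_capacity_factor": 1.0,
  "moe_deepspeed_eval_capacity_factor": 1.0,
  "moe_deepspeed_min_capacity": 4,
  "moe_deepspeed_token_dropping": true,
```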
232 changes: 137 additions & 95 deletions configs/neox_arguments.md
@@ -1056,14 +1056,6 @@ Parallelism Arguments



- **expert_interval**: int

Default = 2

Have one MoE layer every expert_interval layers



## NeoXArgsTemplate

NeoXArgsTemplate()
@@ -1185,93 +1177,6 @@ Text Generation arguments



- **moe_top_k**: int

Default = 1

Activate top K experts in MoE



- **use_tutel**: bool

Default = False

Use Tutel optimizations in MoE



- **num_experts**: int

Default = 1

Number of MoE experts



- **moe_loss_coeff**: float

Default = 0.1

Coefficient for MoE loss



- **moe_train_capacity_factor**: float

Default = 1.0

The capacity of the expert at train time



- **moe_eval_capacity_factor**: float

Default = 1.0

The capacity of the expert at eval time



- **moe_min_capacity**: int

Default = 4

The minimum capacity per expert regardless of the capacity_factor



- **moe_token_dropping**: bool

Default = True

Whether to drop tokens when exceeding capacity



- **create_moe_param_group**: bool

Default = True

Whether to create a separate parameter group for MoE parameters



- **moe_use_residual**: bool

Default = True

Whether to use residual in MoE



- **moe_expert_parallel_size**: int

Default = 1

Number of parallel experts in MoE



## NeoXArgsTokenizer

@@ -2302,3 +2207,140 @@ Args for deepspeed runner (deepspeed.launcher.runner).

Adds a `--account` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard.

## NeoXArgsMoE

Args for Mixture of Experts configuration


- **moe_num_experts**: int

Default = 1

    The number of experts in MoE layers. MoE
    layers are not used if set to 1



- **moe_expert_interval**: int

Default = 1

    Have one MoE layer every `moe_expert_interval` layers


- **moe_top_k**: int

Default = 1

The number of experts each token is routed to
in MoE layers.



- **moe_router_type**: typing.Literal['sinkhorn', 'topk']

Default = 'sinkhorn'

What token routing algorithm to use.



- **moe_lbl_in_fp32**: bool

Default = False

Whether to compute the load balancing loss in fp32.



- **moe_jitter_eps**: float

Default = None

Coefficient for MoE routing jitter. Jitter is
not used if set to None



- **use_deepspeed_moe**: bool

Default = False

    Whether to use the legacy DeepSpeed token-dropping MoE implementation.


- **use_tutel**: bool

Default = False

Use Tutel optimizations in MoE
ONLY USED by DeepSpeed MoE

- **moe_loss_coeff**: float

Default = 0.1

Coefficient for MoE loss. Only used for routing functions like top_k that aren't self-balancing



- **moe_deepspeed_train_capacity_factor**: float

Default = 1.0

The capacity of the expert at train time
ONLY USED by DeepSpeed MoE



- **moe_deepspeed_eval_capacity_factor**: float

Default = 1.0

The capacity of the expert at eval time
ONLY USED by DeepSpeed MoE



- **moe_deepspeed_min_capacity**: int

Default = 4

The minimum capacity per expert regardless of the capacity_factor
ONLY USED by DeepSpeed MoE



- **moe_deepspeed_token_dropping**: bool

Default = True

Whether to drop tokens when exceeding capacity.
ONLY USED by DeepSpeed MoE



- **create_deepspeed_moe_param_group**: bool

Default = True

Whether to create a separate parameter group for MoE parameters.
ONLY USED by DeepSpeed MoE



- **moe_deepspeed_use_residual**: bool

Default = True

Whether to use residual in MoE
ONLY USED by DeepSpeed MoE



- **moe_deepspeed_expert_parallel_size**: int

Default = 1

Number of parallel experts in MoE.
    ONLY USED by DeepSpeed MoE; dMoE uses the model parallel group for expert parallelism
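Taken together, a minimal sketch of how the Megablocks dMoE options documented above might appear in a model config; the option names are the ones listed in this section, the values are illustrative assumptions rather than recommendations:

```yaml
  # illustrative fragment; these are the dMoE options documented above
  "moe_num_experts": 8,          # > 1 enables MoE layers
  "moe_expert_interval": 1,      # one MoE layer every N layers
  "moe_top_k": 2,                # experts each token is routed to
  "moe_router_type": "topk",     # or "sinkhorn" (the default)
  "moe_loss_coeff": 0.1,         # load-balancing loss weight, used with top_k routing
  "moe_jitter_eps": 0.01,        # optional routing jitter; leave unset (None) to disable
```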