From 4fe3b3bfa12e5fd070b028fc949a3e210830c463 Mon Sep 17 00:00:00 2001
From: Yang Zhang
Date: Thu, 16 May 2024 19:16:14 +0000
Subject: [PATCH 1/4] Fix changed behavior of pipe_parallel

---
 megatron/neox_arguments/arguments.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index ff4f4bc21..d2422c19c 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -180,7 +180,6 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None)
         config_files = dict()
         # iterate of all to be loaded yaml files
         for conf_file_name in paths_to_yml_files:
-            # load file
             with open(conf_file_name) as conf_file:
                 conf = yaml.load(conf_file, Loader=yaml.FullLoader)
 
@@ -477,7 +476,6 @@ def get_extra_deepspeed_args(self):
         return extra_ds_args
 
     def get_deepspeed_main_args(self):
-
         args_list = list()
 
         if self.autotuning_run is not None:
@@ -796,14 +794,11 @@ def calculate_batch_parameters(
         # either none of the three parameters are provided or just gradient_accumulation_step is provided
         else:
-            assert (
-                False
-            ), "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided"
+            assert False, "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided"
 
         return int(train_batch), int(micro_batch), int(grad_acc)
 
     @staticmethod
     def check_batch_parameters(dp_world_size, train_batch, micro_batch, grad_acc):
-
         assert (
             train_batch > 0
         ), f"Train batch size: {train_batch} has to be greater than 0"
@@ -1033,10 +1028,7 @@ def calculate_derived(self):
         # Update 'is pipe parallel' flag
         # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with
         # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs
-        self.update_value(
-            "is_pipe_parallel",
-            self.pipe_parallel_size > 1 and self.moe_num_experts == 1,
-        )
+        self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1)
         if self.moe_num_experts > 1:
             assert not (
                 self.is_pipe_parallel or self.pipe_parallel_size > 1
@@ -1106,8 +1098,8 @@ def calculate_derived(self):
         if "flash" in self.attention_config:
             _flash_version = packaging.version.Version(version("flash-attn"))
             if self.sliding_window_width is not None:
-                assert _flash_version >= packaging.version.Version(
-                    "2.3.0"
+                assert (
+                    _flash_version >= packaging.version.Version("2.3.0")
                 ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.3.0 to support sliding window attention."
             if self.pos_emb == "alibi":
                 if not _flash_version >= packaging.version.Version("2.4.0.post1"):
@@ -1234,7 +1226,6 @@ def validate_values(self):
 
         # Parameters sharing does not work with torch DDP.
         if (self.num_unique_layers is not None) and (self.num_layers is not None):
-
             if not (self.num_unique_layers <= self.num_layers):
                 error_message = (
                     self.__class__.__name__

From 764e4381e3011be8c580b7661983f2c62974111e Mon Sep 17 00:00:00 2001
From: github-actions
Date: Tue, 21 May 2024 20:45:46 +0000
Subject: [PATCH 2/4] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 47 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index c8e1492ae..fb25563d8 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-    Default = 6fb840e
+    Default = 4fe3b3b
 
     current git hash of repository
 
@@ -1201,7 +1201,7 @@ Text Generation arguments
 
 
 
-- **num_experts**: int
+- **moe_num_experts**: int
 
     Default = 1
 
@@ -1243,7 +1243,7 @@ Text Generation arguments
 
 - **moe_token_dropping**: bool
 
-    Default = True
+    Default = False
 
     Whether to drop tokens when exceeding capacity
 
@@ -1273,6 +1273,47 @@ Text Generation arguments
 
 
 
+- **moe_type**: str
+
+    Default = megablocks
+
+    Either `deepspeed` or `megablocks`
+
+
+
+- **moe_glu**: bool
+
+    Default = False
+
+    Use gated linear units in MoE
+
+
+
+- **moe_lbl_in_fp32**: bool
+
+    Default = False
+
+    Whether to compute the load balancing loss in fp32.
+
+
+
+- **moe_jitter_eps**: float
+
+    Default = None
+
+    Coefficient for MoE routing jitter. Jitter is
+    not used if set to None
+
+
+
+- **enable_expert_tensor_parallelism**: bool
+
+    Default = False
+
+    Enable expert tensor parallelism
+
+
+
 ## NeoXArgsTokenizer
 
 Tokenizer Arguments

From 56aa2ba81dd2f5c320e2c56b48de9360b801d2f5 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Tue, 21 May 2024 22:51:38 +0000
Subject: [PATCH 3/4] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index c6d369524..a4dc9533c 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-    Default = 1b85a2f
+    Default = abe5c99
 
     current git hash of repository
 

From 1a27f9d451a7c7f2942f591d14ff23e9a47c4ec1 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Fri, 7 Jun 2024 01:26:41 +0000
Subject: [PATCH 4/4] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index f6c3ecde3..7a56e361e 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-    Default = 7aa0074
+    Default = 8451671
 
     current git hash of repository
 
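
For reviewers, a minimal, self-contained sketch of how the revised `is_pipe_parallel` derivation in PATCH 1/4 behaves. The `DerivedFlags` class and its `calculate()` method are hypothetical stand-ins for `NeoXArgs.calculate_derived()`, not code from the repository; they mirror only the two rules visible in the `@@ -1033` hunk (the new `pipe_parallel_size >= 1` update and the MoE/pipeline incompatibility assert).

```python
# Hypothetical sketch -- mirrors only the derived-flag rules visible in PATCH 1/4;
# it is not code from the GPT-NeoX repository.
from dataclasses import dataclass


@dataclass
class DerivedFlags:
    pipe_parallel_size: int = 1
    moe_num_experts: int = 1
    is_pipe_parallel: bool = False

    def calculate(self) -> "DerivedFlags":
        # Patched rule: the flag now tracks only pipe_parallel_size; before the
        # patch it was `pipe_parallel_size > 1 and moe_num_experts == 1`.
        self.is_pipe_parallel = self.pipe_parallel_size >= 1
        if self.moe_num_experts > 1:
            # Same incompatibility check as in the diff: MoE requires running
            # without pipeline parallelism.
            assert not (
                self.is_pipe_parallel or self.pipe_parallel_size > 1
            ), "MoE is not compatible with pipeline parallelism"
        return self


print(DerivedFlags(pipe_parallel_size=1).calculate().is_pipe_parallel)  # True (was False before the patch)
print(DerivedFlags(pipe_parallel_size=0).calculate().is_pipe_parallel)  # False
```

Under this rule, `pipe_parallel_size = 1` now sets `is_pipe_parallel` (it did not before the patch), and MoE runs (`moe_num_experts > 1`) must keep `pipe_parallel_size` at 0 to pass the assert.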
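
PATCH 2/4 documents several new MoE arguments in `configs/neox_arguments.md`. As a rough illustration only (not taken from the repository's example configs), the snippet below collects those settings with their documented defaults, except for `moe_num_experts`, which is raised to enable MoE; the commented-out `NeoXArgs.from_ymls(..., overwrite_values=...)` call and the config path are assumptions based on the signature shown in the PATCH 1/4 diff.

```python
# Illustrative only: argument names and defaults are taken from the updated
# configs/neox_arguments.md; non-default values are arbitrary examples.
moe_overrides = {
    "moe_num_experts": 8,                       # documented as moe_num_experts (previously num_experts), default 1
    "moe_type": "megablocks",                   # default; "deepspeed" is the documented alternative
    "moe_glu": False,                           # use gated linear units in MoE
    "moe_lbl_in_fp32": False,                   # compute the load balancing loss in fp32
    "moe_jitter_eps": None,                     # routing jitter disabled when None
    "moe_token_dropping": False,                # documented default changed from True to False
    "enable_expert_tensor_parallelism": False,  # expert tensor parallelism off
    # Per PATCH 1/4, moe_num_experts > 1 asserts that pipeline parallelism is off.
    "pipe_parallel_size": 0,
}

# Hypothetical usage, assuming overwrite_values takes precedence over the YAML files:
# from megatron.neox_arguments import NeoXArgs
# neox_args = NeoXArgs.from_ymls(["configs/my_moe_config.yml"], overwrite_values=moe_overrides)
```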