From 4fe3b3bfa12e5fd070b028fc949a3e210830c463 Mon Sep 17 00:00:00 2001
From: Yang Zhang
Date: Thu, 16 May 2024 19:16:14 +0000
Subject: [PATCH 1/4] Fix changed behavior of pipe_parallel

---
 megatron/neox_arguments/arguments.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index ff4f4bc21..d2422c19c 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -180,7 +180,6 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None)
         config_files = dict()
         # iterate of all to be loaded yaml files
         for conf_file_name in paths_to_yml_files:
-            # load file
             with open(conf_file_name) as conf_file:
                 conf = yaml.load(conf_file, Loader=yaml.FullLoader)
 
@@ -477,7 +476,6 @@ def get_extra_deepspeed_args(self):
         return extra_ds_args
 
     def get_deepspeed_main_args(self):
-
         args_list = list()
 
         if self.autotuning_run is not None:
@@ -796,14 +794,11 @@ def calculate_batch_parameters(
         # either none of the three parameters are provided or just gradient_accumulation_step is provided
         else:
-            assert (
-                False
-            ), "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided"
+            assert False, "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided"
 
         return int(train_batch), int(micro_batch), int(grad_acc)
 
     @staticmethod
     def check_batch_parameters(dp_world_size, train_batch, micro_batch, grad_acc):
-
         assert (
             train_batch > 0
         ), f"Train batch size: {train_batch} has to be greater than 0"
@@ -1033,10 +1028,7 @@ def calculate_derived(self):
         # Update 'is pipe parallel' flag
         # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with
         # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs
-        self.update_value(
-            "is_pipe_parallel",
-            self.pipe_parallel_size > 1 and self.moe_num_experts == 1,
-        )
+        self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1)
         if self.moe_num_experts > 1:
             assert not (
                 self.is_pipe_parallel or self.pipe_parallel_size > 1
@@ -1106,8 +1098,8 @@ def calculate_derived(self):
         if "flash" in self.attention_config:
             _flash_version = packaging.version.Version(version("flash-attn"))
             if self.sliding_window_width is not None:
-                assert _flash_version >= packaging.version.Version(
-                    "2.3.0"
+                assert (
+                    _flash_version >= packaging.version.Version("2.3.0")
                 ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.3.0 to support sliding window attention."
             if self.pos_emb == "alibi":
                 if not _flash_version >= packaging.version.Version("2.4.0.post1"):
@@ -1234,7 +1226,6 @@ def validate_values(self):
 
         # Parameters sharing does not work with torch DDP.
         if (self.num_unique_layers is not None) and (self.num_layers is not None):
-
             if not (self.num_unique_layers <= self.num_layers):
                 error_message = (
                     self.__class__.__name__

From 764e4381e3011be8c580b7661983f2c62974111e Mon Sep 17 00:00:00 2001
From: github-actions
Date: Tue, 21 May 2024 20:45:46 +0000
Subject: [PATCH 2/4] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 47 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index c8e1492ae..fb25563d8 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-    Default = 6fb840e
+    Default = 4fe3b3b
 
     current git hash of repository
 
@@ -1201,7 +1201,7 @@ Text Generation arguments
 
 
 
-- **num_experts**: int
+- **moe_num_experts**: int
 
     Default = 1
 
@@ -1243,7 +1243,7 @@ Text Generation arguments
 
 - **moe_token_dropping**: bool
 
-    Default = True
+    Default = False
 
     Whether to drop tokens when exceeding capacity
 
@@ -1273,6 +1273,47 @@ Text Generation arguments
 
 
 
+- **moe_type**: str
+
+    Default = megablocks
+
+    Either `deepspeed` or `megablocks`
+
+
+
+- **moe_glu**: bool
+
+    Default = False
+
+    Use gated linear units in MoE
+
+
+
+- **moe_lbl_in_fp32**: bool
+
+    Default = False
+
+    Whether to compute the load balancing loss in fp32.
+
+
+
+- **moe_jitter_eps**: float
+
+    Default = None
+
+    Coefficient for MoE routing jitter. Jitter is
+    not used if set to None
+
+
+
+- **enable_expert_tensor_parallelism**: bool
+
+    Default = False
+
+    Enable expert tensor parallelism
+
+
+
 ## NeoXArgsTokenizer
 
 Tokenizer Arguments

From 56aa2ba81dd2f5c320e2c56b48de9360b801d2f5 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Tue, 21 May 2024 22:51:38 +0000
Subject: [PATCH 3/4] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index c6d369524..a4dc9533c 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-    Default = 1b85a2f
+    Default = abe5c99
 
     current git hash of repository
 

From 1a27f9d451a7c7f2942f591d14ff23e9a47c4ec1 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Fri, 7 Jun 2024 01:26:41 +0000
Subject: [PATCH 4/4] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index f6c3ecde3..7a56e361e 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-    Default = 7aa0074
+    Default = 8451671
 
     current git hash of repository
 
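
For reviewers, a minimal, self-contained sketch of how the revised `is_pipe_parallel` derivation in PATCH 1/4 behaves. The `DerivedFlags` class and its `calculate()` method are hypothetical stand-ins for `NeoXArgs.calculate_derived()`, not code from the repository; they mirror only the two rules visible in the `@@ -1033` hunk (the new `pipe_parallel_size >= 1` update and the MoE/pipeline incompatibility assert).

```python
# Hypothetical sketch -- mirrors only the derived-flag rules visible in PATCH 1/4;
# it is not code from the GPT-NeoX repository.
from dataclasses import dataclass


@dataclass
class DerivedFlags:
    pipe_parallel_size: int = 1
    moe_num_experts: int = 1
    is_pipe_parallel: bool = False

    def calculate(self) -> "DerivedFlags":
        # Patched rule: the flag now tracks only pipe_parallel_size; before the
        # patch it was `pipe_parallel_size > 1 and moe_num_experts == 1`.
        self.is_pipe_parallel = self.pipe_parallel_size >= 1
        if self.moe_num_experts > 1:
            # Same incompatibility check as in the diff: MoE requires running
            # without pipeline parallelism.
            assert not (
                self.is_pipe_parallel or self.pipe_parallel_size > 1
            ), "MoE is not compatible with pipeline parallelism"
        return self


print(DerivedFlags(pipe_parallel_size=1).calculate().is_pipe_parallel)  # True (was False before the patch)
print(DerivedFlags(pipe_parallel_size=0).calculate().is_pipe_parallel)  # False
```

Under this rule, `pipe_parallel_size = 1` now sets `is_pipe_parallel` (it did not before the patch), and MoE runs (`moe_num_experts > 1`) must keep `pipe_parallel_size` at 0 to pass the assert.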
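
PATCH 2/4 documents several new MoE arguments in `configs/neox_arguments.md`. As a rough illustration only (not taken from the repository's example configs), the snippet below collects those settings with their documented defaults, except for `moe_num_experts`, which is raised to enable MoE; the commented-out `NeoXArgs.from_ymls(..., overwrite_values=...)` call and the config path are assumptions based on the signature shown in the PATCH 1/4 diff.

```python
# Illustrative only: argument names and defaults are taken from the updated
# configs/neox_arguments.md; non-default values are arbitrary examples.
moe_overrides = {
    "moe_num_experts": 8,                       # documented as moe_num_experts (previously num_experts), default 1
    "moe_type": "megablocks",                   # default; "deepspeed" is the documented alternative
    "moe_glu": False,                           # use gated linear units in MoE
    "moe_lbl_in_fp32": False,                   # compute the load balancing loss in fp32
    "moe_jitter_eps": None,                     # routing jitter disabled when None
    "moe_token_dropping": False,                # documented default changed from True to False
    "enable_expert_tensor_parallelism": False,  # expert tensor parallelism off
    # Per PATCH 1/4, moe_num_experts > 1 asserts that pipeline parallelism is off.
    "pipe_parallel_size": 0,
}

# Hypothetical usage, assuming overwrite_values takes precedence over the YAML files:
# from megatron.neox_arguments import NeoXArgs
# neox_args = NeoXArgs.from_ymls(["configs/my_moe_config.yml"], overwrite_values=moe_overrides)
```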