From 10bf78871e214f8d0e3bc8662f968e367587a516 Mon Sep 17 00:00:00 2001
From: Jacob Hatef <74274091+jahatef@users.noreply.github.com>
Date: Wed, 15 Nov 2023 22:24:31 -0500
Subject: [PATCH] Update neox_args.py (#1081)

* Update neox_args.py

These attention configuration options were missing from the docs. This will fix that.

* Update NeoXArgs docs automatically

---------

Co-authored-by: github-actions
---
 configs/neox_arguments.md            | 4 ++--
 megatron/neox_arguments/neox_args.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 1af516669..0f2b3c8fa 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-    Default = c0fd5d9
+    Default = b18f25c

     current git hash of repository

@@ -334,7 +334,7 @@ Model Arguments

     The first item in the list specifies the attention type(s), and should be a list of strings. The second item specifies the number of times to repeat those attention types in the full list.

-    attention type choices: [global, local, sparse_fixed, sparse_variable, bslongformer, bigbird]
+    attention type choices: [global, local, sparse_fixed, sparse_variable, bslongformer, bigbird, "gmlp", "amlp", "flash"]

     So a 12 layer network with only global attention could be specified like: [[[`global`], 12]]

diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py
index 8d0953da2..957960832 100644
--- a/megatron/neox_arguments/neox_args.py
+++ b/megatron/neox_arguments/neox_args.py
@@ -175,7 +175,7 @@ class NeoXArgsModel(NeoXArgsTemplate):

     The first item in the list specifies the attention type(s), and should be a list of strings. The second item specifies the number of times to repeat those attention types in the full list.

-    attention type choices: [global, local, sparse_fixed, sparse_variable, bslongformer, bigbird]
+    attention type choices: [global, local, sparse_fixed, sparse_variable, bslongformer, bigbird, "gmlp", "amlp", "flash"]

     So a 12 layer network with only global attention could be specified like: [[[`global`], 12]]
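
For context, the docstring being edited describes a shorthand where each entry is a list of attention type strings plus a repeat count. Below is a minimal sketch of how that shorthand expands into one attention type per layer; the helper name `expand_attention_config` and the example configs are illustrative only and are not part of this patch or the GPT-NeoX API.

```python
# Illustrative sketch: expand the [[["type", ...], repeat], ...] shorthand
# from the docstring above into a flat per-layer list of attention types.
# The helper name and example values are hypothetical, not from this patch.

def expand_attention_config(attention_config, num_layers):
    """Expand [[[types], repeat], ...] into one attention type per layer."""
    expanded = []
    for types, repeat in attention_config:
        # Repeat the group of attention types `repeat` times.
        expanded.extend(types * repeat)
    assert len(expanded) == num_layers, "config must cover every layer exactly"
    return expanded


# The 12-layer, global-only example from the docstring:
print(expand_attention_config([[["global"], 12]], num_layers=12))
# -> ['global', 'global', ..., 'global']  (12 entries)

# A hypothetical mixed config using the newly documented "flash" choice:
print(expand_attention_config([[["flash"], 6], [["global"], 6]], num_layers=12))
```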