
Adds a script to convert NeoX 2.0 checkpoints to DeepSpeed's universal checkpoint format #836

Open
Wants to merge 115 commits into base branch: main
Changes from all commits
Commits
115 commits
b666034
Check BigScience scripts
dashstander Jan 31, 2023
4c1850f
Import NeoxCheckpoint
dashstander Jan 31, 2023
174a182
Different file structure
dashstander Feb 1, 2023
e065d44
Reformat
dashstander Feb 1, 2023
83a4aa6
Add BigScience universal script
dashstander Feb 1, 2023
51fa5df
Use tokenizer directly
dashstander Feb 3, 2023
ea11b87
Use tokenizer directly
dashstander Feb 3, 2023
48c395f
Use tokenizer directly
dashstander Feb 3, 2023
1c9aec2
Use tokenizer directly
dashstander Feb 3, 2023
97bfd11
build_tokenizer actually returns None
dashstander Feb 6, 2023
2ab9ca2
If we have the tokenizer we already have the padded vocab size
dashstander Feb 6, 2023
d2f028e
If we have the tokenizer we already have the padded vocab size
dashstander Feb 6, 2023
7765730
Changing universal
dashstander Feb 7, 2023
8dee716
Changing universal
dashstander Feb 7, 2023
4f06a6f
Need to calculate properly
dashstander Feb 13, 2023
8097e17
Need to calculate properly
dashstander Feb 13, 2023
027fb91
Make our own function
dashstander Feb 13, 2023
3af69a5
Make our own function
dashstander Feb 13, 2023
ad23aca
Make our own function
dashstander Feb 13, 2023
04129bd
Need to reshape with tp in mind
dashstander Feb 13, 2023
6b15d57
Need to reshape with tp in mind
dashstander Feb 13, 2023
4cd7cfc
Merge branch 'main' into ckpt_reshape
dashstander Mar 13, 2023
cd09987
Get tokenizer size from tokenizer
dashstander Mar 14, 2023
04a7829
Get tokenizer size from tokenizer
dashstander Mar 14, 2023
40c4ae9
Add argument for universal checkpoint
dashstander Mar 14, 2023
858b2be
Update NeoXArgs docs automatically
invalid-email-address Mar 14, 2023
557751f
Merge branch 'main' into ckpt_reshape
Quentin-Anthony Mar 26, 2023
875945d
Update NeoXArgs docs automatically
invalid-email-address Mar 26, 2023
66e856d
Add documentation for checkpoint conversion scripts
dashstander Mar 28, 2023
fa16eea
Finish tools/README.md for checkpoint reshaping
dashstander Apr 10, 2023
e2742ab
Merge branch 'main' into ckpt_reshape
dashstander Apr 10, 2023
89dcab1
Update NeoXArgs docs automatically
invalid-email-address Apr 10, 2023
de8501f
Reorganize tools directory
Apr 10, 2023
069d93d
Update NeoXArgs docs automatically
invalid-email-address Apr 10, 2023
2f8e8df
Left off the checkpoints directory
Apr 10, 2023
b77b70c
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
Apr 10, 2023
8e40d9c
Update NeoXArgs docs automatically
invalid-email-address Apr 10, 2023
30534d1
fixed conflicts
StellaAthena Apr 17, 2023
da46b68
Update NeoXArgs docs automatically
invalid-email-address Apr 17, 2023
0966f31
Merge branch 'main' into ckpt_reshape
Quentin-Anthony May 18, 2023
71d0d68
Update NeoXArgs docs automatically
invalid-email-address May 18, 2023
c912932
pre-commit
dashstander May 30, 2023
5c33b5d
Update NeoXArgs docs automatically
invalid-email-address May 30, 2023
5325386
gitignore
dashstander May 30, 2023
1a3c00a
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
dashstander May 30, 2023
f1fde46
Update NeoXArgs docs automatically
invalid-email-address May 30, 2023
4c468b9
typo
dashstander Jun 2, 2023
6c00899
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
dashstander Jun 2, 2023
88cb93b
Update NeoXArgs docs automatically
invalid-email-address Jun 2, 2023
89623f6
Rename args
dashstander Jun 2, 2023
a6a7a84
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
dashstander Jun 2, 2023
f9d3062
Update NeoXArgs docs automatically
invalid-email-address Jun 2, 2023
7bd3500
pre commit
dashstander Jun 9, 2023
e74c8b1
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
dashstander Jun 9, 2023
c647a57
need to handle folders better
dashstander Jun 9, 2023
42a23a0
need to handle folders better
dashstander Jun 9, 2023
5e32016
need to handle folders better
dashstander Jun 9, 2023
8c1e8fb
Merge branch 'main' into ckpt_reshape
Quentin-Anthony Jun 9, 2023
218cea6
Update NeoXArgs docs automatically
invalid-email-address Jun 9, 2023
a6fd1fd
need to handle folders better
dashstander Jun 9, 2023
fa0faa2
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
dashstander Jun 9, 2023
1d0b406
Update NeoXArgs docs automatically
invalid-email-address Jun 9, 2023
a92ee40
need to handle folders better
dashstander Jun 9, 2023
45c7d8f
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
dashstander Jun 9, 2023
2a2721d
Update NeoXArgs docs automatically
invalid-email-address Jun 9, 2023
32a9578
need to handle folders better
dashstander Jun 9, 2023
985d9c8
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
dashstander Jun 9, 2023
b40cfeb
Update NeoXArgs docs automatically
invalid-email-address Jun 9, 2023
517f836
pre-commit
dashstander Jun 14, 2023
b0c9d85
Update NeoXArgs docs automatically
invalid-email-address Jun 14, 2023
b6c1845
Pre-commit
dashstander Jul 26, 2023
1c63a80
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
dashstander Jul 26, 2023
6a86723
whoops
dashstander Jul 26, 2023
f77c123
I guess I did need that
dashstander Jul 26, 2023
0459479
I guess I did need that
dashstander Jul 26, 2023
7c582d1
I guess I did need that
dashstander Jul 26, 2023
40ea6c6
I guess I did need that
dashstander Jul 26, 2023
33be30f
universal
dashstander Jul 29, 2023
2732361
more printing
dashstander Jul 29, 2023
df131b9
more printing
dashstander Jul 31, 2023
191772b
hmmmm
dashstander Jul 31, 2023
bc9bc01
hmmmm
dashstander Jul 31, 2023
eb9e278
hmmmm
dashstander Jul 31, 2023
9a63d65
blegh
dashstander Jul 31, 2023
5c8af0d
blegh
dashstander Jul 31, 2023
96c1cf1
merged
dashstander Aug 7, 2023
547f165
Update NeoXArgs docs automatically
invalid-email-address Aug 7, 2023
8d9f324
Make more robust
dashstander Aug 7, 2023
a3f79a3
merge
dashstander Aug 7, 2023
0f43323
Update NeoXArgs docs automatically
invalid-email-address Aug 7, 2023
8fbaa15
ok
dashstander Aug 8, 2023
54e9a5a
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
dashstander Aug 8, 2023
9d39106
Update NeoXArgs docs automatically
invalid-email-address Aug 8, 2023
5947a66
ok
dashstander Aug 8, 2023
00ace51
Update NeoXArgs docs automatically
invalid-email-address Aug 8, 2023
58a3c92
different printing
dashstander Aug 9, 2023
f157814
Update NeoXArgs docs automatically
invalid-email-address Aug 9, 2023
9903e8e
ok
dashstander Aug 9, 2023
8b74404
Update NeoXArgs docs automatically
invalid-email-address Aug 9, 2023
860161b
ok
dashstander Aug 9, 2023
1fec493
Update NeoXArgs docs automatically
invalid-email-address Aug 9, 2023
c0682eb
blegh
dashstander Aug 9, 2023
188c379
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
dashstander Aug 9, 2023
38b108a
Update NeoXArgs docs automatically
invalid-email-address Aug 9, 2023
70284f1
printing
dashstander Aug 9, 2023
ed59200
Update NeoXArgs docs automatically
invalid-email-address Aug 9, 2023
3dea236
change printing
dashstander Aug 21, 2023
206ffea
Merge branch 'ckpt_reshape' of https://github.com/EleutherAI/gpt-neox…
dashstander Aug 21, 2023
a697c81
Get param group info
dashstander Aug 30, 2023
f51ceee
Get param group info
dashstander Aug 30, 2023
9645980
Get param group info
dashstander Aug 30, 2023
e9d3000
Get param group info
dashstander Aug 30, 2023
c32bc10
Get param group info
dashstander Aug 30, 2023
11531ad
Log bit16 groups
dashstander Sep 5, 2023
9ab2cee
ok
dashstander Sep 7, 2023
2 changes: 1 addition & 1 deletion .gitignore
@@ -139,7 +139,7 @@ data/**/*.txt
data/**/*.gz
data/**/*.np*
data/**/*.npy
checkpoints/
./checkpoints/
.vscode/
*.pt
*.ckpt
6 changes: 3 additions & 3 deletions README.md
@@ -165,7 +165,7 @@ Or use the 20B tokenizer (for which only a single Vocab file is needed):

(alternatively, you can provide any tokenizer file that can be loaded by Hugging Face's tokenizers library with the `Tokenizer.from_pretrained()` command)

You can now pretokenize your data using `tools/preprocess_data.py`, the arguments for which are detailed below:
You can now pretokenize your data using `tools/datasets/preprocess_data.py`, the arguments for which are detailed below:

```
usage: preprocess_data.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] [--num-docs NUM_DOCS] --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer} [--vocab-file VOCAB_FILE] [--merge-file MERGE_FILE] [--append-eod] [--ftfy] --output-prefix OUTPUT_PREFIX
@@ -206,7 +206,7 @@ runtime:
For example:

```bash
python tools/preprocess_data.py \
python tools/datasets/preprocess_data.py \
--input ./data/mydataset.jsonl.zst \
--output-prefix ./data/mydataset \
--vocab ./data/gpt2-vocab.json \
@@ -322,7 +322,7 @@ python ./tools/convert_sequential_to_hf.py --input_dir /path/to/model/global_st
Then to upload a model to [the Hugging Face Hub](https://huggingface.co/), run:
```bash
huggingface-cli login
python ./tools/upload.py
python ./tools/checkpoints/upload.py
```
and input the requested information, including HF hub user token.

10 changes: 9 additions & 1 deletion configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

Default = d3e481c
Default = 70284f1

current git hash of repository

@@ -1906,6 +1906,14 @@ Args for deepspeed config



- **load_universal**: bool

Default = False

Flag for whether the checkpoint to be loaded is a universal checkpoint.



## NeoXArgsDeepspeedRunner

Args for deepspeed runner (deepspeed.launcher.runner).
26 changes: 23 additions & 3 deletions megatron/model/utils.py
@@ -40,10 +40,28 @@ def get_params_for_weight_decay_optimization(module, neox_args):
        ) or (
            neox_args.weight_decay == 0.0
        ):  # also include all parameters here if no weight decay is being done
            no_weight_decay_params["params"].extend(
                [p for p in list(module_._parameters.values()) if p is not None]
            )
            # no_weight_decay_params["params"].extend(
            #     [p for p in list(module_._parameters.values()) if p is not None]
            # )
            params = []
            for n, p in module_._parameters.items():
                if p is not None:
                    p.module_name = f"{module_._get_name()}.{n}"
                    params.append(p)
            no_weight_decay_params["params"].extend(params)
        else:
            wd_params = []
            nwd_params = []
            for n, p in module_._parameters.items():
                if p is not None:
                    p.module_name = f"{module_._get_name()}.{n}"
                    if n != "bias":
                        wd_params.append(p)
                    else:
                        nwd_params.append(p)
            weight_decay_params["params"].extend(wd_params)
            no_weight_decay_params["params"].extend(nwd_params)
            """
            weight_decay_params["params"].extend(
                [
                    p
@@ -58,6 +76,8 @@ def get_params_for_weight_decay_optimization(module, neox_args):
                    if p is not None and n == "bias"
                ]
            )
            """

    if neox_args.weight_decay == 0.0:
        # only return a single param group
        # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once.
3 changes: 3 additions & 0 deletions megatron/neox_arguments/deepspeed_args.py
@@ -277,6 +277,9 @@ class NeoXArgsDeepspeedConfig(NeoXArgsTemplate):
autotuning: dict = None
"""Dictionary as described in DeepSpeed autotuning documentation: https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning"""

load_universal: bool = False
"""Flag for whether the checkpoint to be loaded is a universal checkpoint."""


@dataclass
class NeoXArgsDeepspeedRunner(NeoXArgsTemplate):
52 changes: 52 additions & 0 deletions megatron/training.py
@@ -610,6 +610,53 @@ def get_learning_rate_scheduler(optimizer, neox_args):
    return lr_scheduler


from collections import OrderedDict
import json


def log_bit16_groups(optimizer, param_names, zero_stage):

    """Returns a list of name-to-shape mappings (one per parameter group), only for the
    flattened fp32 weights saved by the optimizer. The names are exactly as in state_dict.
    The order is absolutely important, since the saved data is just flattened data with no
    identifiers and requires reconstruction in the same order it was saved.
    We can't rely on module.named_parameters() to get the saved tensors, as some params
    will be missing and others unsaved, and then it'd be impossible to reconstruct state_dict
    from the flattened weights.
    optimizer.bit16_groups seems to be the easiest to use as it's in all zeroX versions.
    """
    param_group_shapes = []
    cnt = 0
    numel = 0

    # zero2 started using a round_robin_bit16_groups which is a shuffled version of bit16_groups -
    # if we don't use it, we get parameters ordered incorrectly
    if hasattr(optimizer, "round_robin_bit16_groups"):
        bit16_groups = optimizer.round_robin_bit16_groups
    else:
        bit16_groups = (
            optimizer.bit16_groups if zero_stage == 2 else optimizer.fp16_groups
        )

    for bit16_group in bit16_groups:
        param_shapes = OrderedDict()
        for param in bit16_group:
            cnt += 1
            numel += param.ds_numel if hasattr(param, "ds_numel") else param.numel()
            shape = param.ds_shape if hasattr(param, "ds_shape") else param.shape
            if param not in param_names:
                raise ValueError(f"failed to find optimizer param in named params")
            name = param_names[param]
            param_shapes[name] = shape

            # uncomment to debug zero_to_fp32.py problems
            # if self.global_rank == 0: print(f"saving param {name} {shape} (numel={shape.numel()})")
        param_group_shapes.append(param_shapes)
        # if self.global_rank == 0: print(f"Total saved {numel} numels in {cnt} params")

    return param_group_shapes


def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None):
    """Setup model and optimizer."""
    model = get_model(neox_args=neox_args, use_cache=use_cache)
@@ -637,6 +684,11 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None):
            # config_params=neox_args.deepspeed_config,
            mpu=mpu if not neox_args.is_pipe_parallel else None,
        )
        zero_stage = neox_args.zero_optimization["stage"]
        # bit16_groups = log_bit16_groups(optimizer, model.param_names, zero_stage)
        bit16_groups = model._get_zero_param_shapes()
        with open(f"zero{zero_stage}.json", mode="w") as jfile:
            json.dump(bit16_groups, jfile)
        model.total_params = get_total_params(model.module)
        print_rank_0(f' > total params: {"{:,}".format(model.total_params)}')

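
A minimal sketch of how the `zero{stage}.json` file written above could be inspected, assuming it holds a list of per-parameter-group name-to-shape mappings (as produced by `_get_zero_param_shapes()`); the stage value here is a placeholder:

```python
import json
from math import prod

# Placeholder; use whatever ZeRO stage the run was configured with.
zero_stage = 1

with open(f"zero{zero_stage}.json") as jfile:
    # Assumed structure: a list of {parameter_name: shape} dicts, one per param group.
    param_groups = json.load(jfile)

for i, group in enumerate(param_groups):
    total = sum(prod(shape) for shape in group.values())
    print(f"param group {i}: {len(group)} params, {total:,} elements")
    for name, shape in group.items():
        print(f"  {name}: {tuple(shape)}")
```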
15 changes: 15 additions & 0 deletions tools/README.md
@@ -0,0 +1,15 @@
# GPT-NeoX Auxiliary Tools

This directory contains a number of auxiliary tools that are useful for working with GPT-NeoX but are not part of the main training code.

## Bash

This directory contains some simple, frequently used bash commands to make working on multiple machines easier.

## Checkpoints

This directory contains tools for manipulating and converting checkpoints, including changing the parallelism settings of a pretrained model, converting between GPT-NeoX and the Hugging Face transformers library, and updating checkpoints trained with Version 1.x of this library to be compatible with Version 2.x.

## Datasets

This directory contains tools for downloading and preprocessing datasets to the format expected by the GPT-NeoX library.
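
As a quick illustration of the reorganized layout, scripts are invoked from their new subdirectories; only the two paths that appear in the updated top-level README are assumed here:

```bash
# Dataset preprocessing now lives under tools/datasets/
python tools/datasets/preprocess_data.py --help

# Checkpoint upload (and other checkpoint tools) now live under tools/checkpoints/
python tools/checkpoints/upload.py
```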
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
50 changes: 50 additions & 0 deletions tools/checkpoints/README.md
@@ -0,0 +1,50 @@
# GPT-NeoX Checkpoint Manipulation Tools

## Checkpoint Conversion

The default format in which DeepSpeed checkpoints are saved depends on the model (tensor) and pipeline parallelism settings of the training run, which makes it difficult to run the model on a cluster with a different number or type of GPUs. We have adapted a set of scripts developed by [BigScience](https://github.com/bigscience-workshop/Megatron-DeepSpeed/tree/main/tools/convert_checkpoint) to make this easier.

### DeeperSpeed to universal

To convert your checkpoint to the universal checkpoint format, run the `ds_to_universal.py` script with a command along these lines:

```bash
CURR_CKPT="/path/to/your/old/checkpoint"
NEW_CKPT="/path/where/you/want/the/new/checkpoint"
CFG="/path/to/model/config/file"

python3 tools/ds_to_universal.py \
--input_folder $CURR_CKPT \
--output_folder $NEW_CKPT \
--config $CFG
```

To then run the model from your new checkpoint, add these lines to a new config and run your model like you normally would.

```json
{
"load": "/path/where/you/want/the/new/checkpoint",
"load_universal": true
}
```

### DeeperSpeed to DeeperSpeed Reshaping

To reshape a DeeperSpeed checkpoint to _reduce_ its parallelism settings, use the `deepspeed_to_deepspeed.py` script. It cannot re-shard a model to _increase_ the amount of tensor or pipeline parallelism, but to decrease the amount of parallelism you can run the script with a command like the one below:

```bash
CURR_CKPT="/path/to/your/old/checkpoint"
NEW_CKPT="/path/where/you/want/the/new/checkpoint"
CFG="/path/to/model/config/file"
TP=1 # Tensor (model) parallelism setting for the new checkpoint, must be less than or equal to the model's original tensor parallelism
DP=1 # Data parallelism setting for the new checkpoint
PP=1 # Pipeline parallelism setting for the new checkpoint, must be less than or equal to the model's original pipeline parallelism

python3 tools/deepspeed_to_deepspeed.py \
--input_folder $CURR_CKPT \
--output_folder $NEW_CKPT \
--config $CFG \
--target_tp $TP \
--target_dp $DP \
--target_pp $PP
```
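
As a sanity check after either conversion, you can open a few of the saved shards and list their top-level keys. This is a minimal sketch; the exact file names inside the output folder depend on the model and the target parallelism settings:

```python
import glob
import torch

new_ckpt = "/path/where/you/want/the/new/checkpoint"  # same value as NEW_CKPT above

# Checkpoint shards are ordinary torch pickles; inspect a handful of them.
for path in sorted(glob.glob(f"{new_ckpt}/**/*.pt", recursive=True))[:5]:
    state = torch.load(path, map_location="cpu")
    keys = list(state.keys()) if isinstance(state, dict) else type(state).__name__
    print(path, "->", keys)
```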