
Commit

Merge branch 'master' into tapas-tf
kamalkraj committed Nov 29, 2021
2 parents 343195d + 25156eb commit fbad9bb
Showing 49 changed files with 309 additions and 122 deletions.
4 changes: 2 additions & 2 deletions .circleci/config.yml
@@ -279,7 +279,7 @@ jobs:
- v0.4-tf-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
- run: pip install tensorflow_probability
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
@@ -313,7 +313,7 @@ jobs:
- v0.4-tf-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
- run: pip install tensorflow_probability
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
7 changes: 4 additions & 3 deletions .github/workflows/self-nightly-scheduled.yml
@@ -205,8 +205,9 @@ jobs:
apt -y update && apt install -y libaio-dev
pip install --upgrade pip
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
pip install .[testing,deepspeed,fairscale]
pip install git+https://github.com/microsoft/DeepSpeed
rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
pip install .[testing,fairscale]
pip install git+https://github.com/microsoft/DeepSpeed # testing bleeding edge
- name: Are GPUs recognized by our DL frameworks
run: |
@@ -218,7 +219,7 @@
- name: Run all tests on GPU
run: |
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
13 changes: 7 additions & 6 deletions .github/workflows/self-push.yml
@@ -50,7 +50,7 @@ jobs:
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Fetch the tests to run
run: |
python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
@@ -105,7 +105,7 @@ jobs:
run: |
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
- name: Fetch the tests to run
run: |
python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
@@ -203,7 +203,7 @@ jobs:
apt install -y libsndfile1-dev
pip install --upgrade pip
pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
- name: Launcher docker
uses: actions/checkout@v2
with:
@@ -277,7 +277,7 @@ jobs:
# run: |
# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
#
#
# - name: Fetch the tests to run
# run: |
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
@@ -389,11 +389,11 @@ jobs:
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Fetch the tests to run
run: |
python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt
- name: Report fetched tests
uses: actions/upload-artifact@v2
with:
@@ -437,6 +437,7 @@ jobs:
run: |
apt -y update && apt install -y libaio-dev
pip install --upgrade pip
rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
pip install .[testing,deepspeed,fairscale]
- name: Are GPUs recognized by our DL frameworks
5 changes: 3 additions & 2 deletions .github/workflows/self-scheduled.yml
@@ -143,7 +143,7 @@ jobs:
run: |
apt -y update && apt install -y libsndfile1-dev git
pip install --upgrade pip
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
- name: Are GPUs recognized by our DL frameworks
run: |
@@ -293,7 +293,7 @@ jobs:
run: |
apt -y update && apt install -y libsndfile1-dev git
pip install --upgrade pip
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
- name: Are GPUs recognized by our DL frameworks
run: |
@@ -429,6 +429,7 @@ jobs:
run: |
apt -y update && apt install -y libaio-dev
pip install --upgrade pip
rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
pip install .[testing,deepspeed,fairscale]
- name: Are GPUs recognized by our DL frameworks
55 changes: 55 additions & 0 deletions docs/source/main_classes/deepspeed.rst
@@ -46,6 +46,20 @@ won't be possible on a single GPU.
parts of DeepSpeed like ``zero.Init`` for ZeRO stage 3 and higher. To tap into this feature read the docs on
:ref:`deepspeed-non-trainer-integration`.

What is integrated:

Training:

1. DeepSpeed ZeRO training supports the full ZeRO stages 1, 2 and 3 with ZeRO-Infinity (CPU and NVME offload).

Inference:

1. DeepSpeed ZeRO Inference supports ZeRO stage 3 with ZeRO-Infinity. It uses the same ZeRO protocol as training, but
   it doesn't use an optimizer or a learning rate scheduler, and only stage 3 is relevant. For more details see:
   :ref:`deepspeed-zero-inference`.

There is also DeepSpeed Inference - this is a totally different technology which uses Tensor Parallelism instead of
ZeRO (coming soon).
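To make the above concrete, here is a minimal sketch of a ZeRO-3 configuration with CPU offload, written as a Python
dict rather than a JSON file (the key names follow the DeepSpeed documentation; the ``"auto"`` values are placeholders
that the Trainer integration resolves, and this is not a complete production config):

.. code-block:: python

    ds_config = {
        "zero_optimization": {
            "stage": 3,
            "offload_optimizer": {"device": "cpu"},  # ZeRO-Infinity style offload; "nvme" is also possible
            "offload_param": {"device": "cpu"},
        },
        "fp16": {"enabled": "auto"},
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
    }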



@@ -1628,6 +1642,47 @@ larger multi-dimensional shape, this means that the parameter is partitioned and



.. _deepspeed-zero-inference:


ZeRO Inference
=======================================================================================================================

ZeRO Inference uses the same config as ZeRO-3 training. You just don't need the optimizer and scheduler sections. In
fact, you can leave these in the config file if you want to share the same one with training; they will simply be
ignored.

Otherwise you just need to pass the usual :class:`~transformers.TrainingArguments` arguments. For example:

.. code-block:: bash

    deepspeed --num_gpus=2 your_program.py <normal cl args> --do_eval --deepspeed ds_config.json

The only important thing is that you need to use a ZeRO-3 configuration, since ZeRO-2 provides no benefit for
inference: only ZeRO-3 shards the model parameters, whereas ZeRO-1 and ZeRO-2 only shard the optimizer states and
gradients.

Here is an example of running ``run_translation.py`` under DeepSpeed, using all available GPUs:

.. code-block:: bash

    deepspeed examples/pytorch/translation/run_translation.py \
    --deepspeed tests/deepspeed/ds_config_zero3.json \
    --model_name_or_path t5-small --output_dir output_dir \
    --do_eval --max_eval_samples 50 --warmup_steps 50 \
    --max_source_length 128 --val_max_target_length 128 \
    --overwrite_output_dir --per_device_eval_batch_size 4 \
    --predict_with_generate --dataset_config "ro-en" --fp16 \
    --source_lang en --target_lang ro --dataset_name wmt16 \
    --source_prefix "translate English to Romanian: "

Since inference has no need for the additional large amounts of memory used by the optimizer states and the gradients,
you should be able to fit much larger batches and/or sequence lengths onto the same hardware.
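When driving the same thing from Python instead of the command line, the evaluation-only setup is just the usual
:class:`~transformers.TrainingArguments` with the ``deepspeed`` option pointing at a ZeRO-3 config (a minimal sketch,
assuming such a config file exists at the given path):

.. code-block:: python

    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="output_dir",
        do_eval=True,
        per_device_eval_batch_size=4,
        fp16=True,
        deepspeed="tests/deepspeed/ds_config_zero3.json",  # optimizer/scheduler sections are ignored for eval
    )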


Additionally, DeepSpeed is currently developing a related product called DeepSpeed-Inference which has no relationship
to the ZeRO technology, but instead uses tensor parallelism to scale models that can't fit onto a single GPU. This is a
work in progress and we will provide the integration once that product is complete.


Filing Issues
=======================================================================================================================
3 changes: 2 additions & 1 deletion docs/source/main_classes/tokenizer.rst
@@ -39,7 +39,8 @@ methods for using all the tokenizers:
- Managing special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the
tokenizer for easy access and making sure they are not split during tokenization.

:class:`~transformers.BatchEncoding` holds the output of the tokenizer's encoding methods (``__call__``,
:class:`~transformers.BatchEncoding` holds the output of the
:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`'s encoding methods (``__call__``,
``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python
tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by
these methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e., backed by
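As a quick illustration of the dict-like behavior described above (a minimal sketch, assuming ``transformers`` is
installed and the ``bert-base-uncased`` checkpoint is reachable):

.. code-block:: python

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    encoding = tokenizer("Hello world!")   # returns a BatchEncoding
    print(encoding.keys())                 # dict-like: input_ids, token_type_ids, attention_mask
    print(encoding["input_ids"])           # standard dictionary access to the model inputs
    print(encoding.tokens())               # extra methods available when a "Fast" tokenizer is used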
4 changes: 2 additions & 2 deletions docs/source/model_doc/imagegpt.rst
@@ -96,10 +96,10 @@ ImageGPTModel
:members: forward


ImageGPTForCausalLM
ImageGPTForCausalImageModeling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.ImageGPTForCausalLM
.. autoclass:: transformers.ImageGPTForCausalImageModeling
:members: forward


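For reference, a minimal usage sketch of the renamed class (the ``openai/imagegpt-small`` checkpoint name is an
assumption here, not taken from this diff, and requires a transformers version that ships ImageGPT):

.. code-block:: python

    from transformers import ImageGPTFeatureExtractor, ImageGPTForCausalImageModeling

    feature_extractor = ImageGPTFeatureExtractor.from_pretrained("openai/imagegpt-small")
    model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")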
48 changes: 36 additions & 12 deletions docs/source/quicktour.rst
@@ -51,6 +51,15 @@ The easiest way to use a pretrained model on a given task is to use :func:`~tran
Let's see how this works for sentiment analysis (the other tasks are all covered in the :doc:`task summary
</task_summary>`):

Install the following dependencies (if not already installed):

.. code-block:: bash

    ## PYTORCH CODE
    pip install torch
    ## TENSORFLOW CODE
    pip install tensorflow

.. code-block::

    >>> from transformers import pipeline
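In the full quicktour this snippet continues roughly as follows (a sketch of the published example; the exact score
will vary slightly between model versions):

.. code-block::

    >>> classifier = pipeline("sentiment-analysis")
    >>> classifier("We are very happy to show you the 🤗 Transformers library.")
    [{'label': 'POSITIVE', 'score': 0.9998}]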
@@ -337,27 +346,42 @@ Once your model is fine-tuned, you can save it with its tokenizer in the followi

.. code-block::
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)
>>> ## PYTORCH CODE
>>> pt_save_directory = './pt_save_pretrained'
>>> tokenizer.save_pretrained(pt_save_directory)
>>> pt_model.save_pretrained(pt_save_directory)
>>> ## TENSORFLOW CODE
>>> tf_save_directory = './tf_save_pretrained'
>>> tokenizer.save_pretrained(tf_save_directory)
>>> tf_model.save_pretrained(tf_save_directory)
You can then load this model back using the :func:`~transformers.AutoModel.from_pretrained` method by passing the
directory name instead of the model name. One cool feature of 🤗 Transformers is that you can easily switch between
PyTorch and TensorFlow: any model saved as before can be loaded back either in PyTorch or TensorFlow. If you are
loading a saved PyTorch model in a TensorFlow model, use :func:`~transformers.TFAutoModel.from_pretrained` like this:
PyTorch and TensorFlow: any model saved as before can be loaded back either in PyTorch or TensorFlow.

.. code-block::

from transformers import TFAutoModel
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = TFAutoModel.from_pretrained(save_directory, from_pt=True)
If you would like to load your saved model in the other framework, first make sure it is installed:

.. code-block:: bash
and if you are loading a saved TensorFlow model in a PyTorch model, you should use the following code:
## PYTORCH CODE
pip install tensorflow
## TENSORFLOW CODE
pip install torch
Then, use the corresponding Auto class to load it like this:

.. code-block::
from transformers import AutoModel
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModel.from_pretrained(save_directory, from_tf=True)
## PYTORCH CODE
>>> from transformers import TFAutoModel
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> tf_model = TFAutoModel.from_pretrained(pt_save_directory, from_pt=True)
## TENSORFLOW CODE
>>> from transformers import AutoModel
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> pt_model = AutoModel.from_pretrained(tf_save_directory, from_tf=True)
Lastly, you can also ask the model to return all hidden states and all attention weights if you need them:

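The diff truncates here; in the published quicktour the call that follows looks roughly like this sketch, where
``pt_batch`` stands for the tokenized inputs built earlier in the guide (an assumption, as that part is not shown in
this diff):

.. code-block:: python

    >>> pt_outputs = pt_model(**pt_batch, output_hidden_states=True, output_attentions=True)
    >>> all_hidden_states = pt_outputs.hidden_states
    >>> all_attentions = pt_outputs.attentions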
3 changes: 2 additions & 1 deletion examples/flax/language-modeling/run_clm_flax.py
@@ -27,6 +27,7 @@
import sys
import time
from dataclasses import dataclass, field
from itertools import chain
from pathlib import Path
from typing import Callable, Optional

@@ -430,7 +431,7 @@ def tokenize_function(examples):
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
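The ``group_texts`` change above (repeated in the other language-modeling scripts below) swaps ``sum(examples[k], [])``
for ``itertools.chain``. Both flatten a list of lists, but ``chain`` avoids the quadratic copying of repeated list
concatenation; a small self-contained sketch:

.. code-block:: python

    from itertools import chain

    examples = {"input_ids": [[1, 2], [3, 4], [5]]}  # toy stand-in for a tokenized batch

    flattened_sum = {k: sum(examples[k], []) for k in examples.keys()}         # repeated concatenation, O(n^2)
    flattened_chain = {k: list(chain(*examples[k])) for k in examples.keys()}  # single pass over all elements

    assert flattened_sum == flattened_chain  # same result: {'input_ids': [1, 2, 3, 4, 5]}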
3 changes: 2 additions & 1 deletion examples/flax/language-modeling/run_mlm_flax.py
@@ -25,6 +25,7 @@
import sys
import time
from dataclasses import dataclass, field
from itertools import chain

# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
from pathlib import Path
@@ -453,7 +454,7 @@ def tokenize_function(examples):
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
3 changes: 2 additions & 1 deletion examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -25,6 +25,7 @@
import sys
import time
from dataclasses import dataclass, field
from itertools import chain
from pathlib import Path
from typing import Dict, List, Optional

@@ -563,7 +564,7 @@ def tokenize_function(examples):
# Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
3 changes: 2 additions & 1 deletion examples/pytorch/language-modeling/run_clm.py
@@ -26,6 +26,7 @@
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional

import datasets
@@ -408,7 +409,7 @@ def tokenize_function(examples):
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
3 changes: 2 additions & 1 deletion examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -27,6 +27,7 @@
import math
import os
import random
from itertools import chain
from pathlib import Path

import datasets
@@ -366,7 +367,7 @@ def tokenize_function(examples):
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
3 changes: 2 additions & 1 deletion examples/pytorch/language-modeling/run_mlm.py
@@ -26,6 +26,7 @@
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional

import datasets
@@ -432,7 +433,7 @@ def tokenize_function(examples):
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
