Add BlenderbotTokenizerFast (huggingface#13720)

* Add the support for the fast (rust) implementation of BlenbderbotTokenizer * Fix a converter and a typo in a doc * Apply the patil-suraj's suggestion * (Nitpick) Fast tokenization -> Fast Tokenization in doc * Apply the SaulLu's suggestion * Apply Narsil's suggestion to fix test pipelines * Add encoder_no_repeat_ngram_size according to the Narsil's suggestion * Revert the last (unnecessary) commit * Override pipeline config for Blenderbot to allow for larger pos. emb. * make fix-copies
Albertobegue · Jan 27, 2022 · f4dcaa0 · f4dcaa0
1 parent ffff564
commit f4dcaa0
Show file tree

Hide file tree

Showing 12 changed files with 181 additions and 6 deletions.
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -379,7 +379,7 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       BigBirdPegasus        |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         Blenderbot          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+|         Blenderbot          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       BlenderbotSmall       |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+

diff --git a/docs/source/model_doc/blenderbot.rst b/docs/source/model_doc/blenderbot.rst
@@ -81,6 +81,13 @@ BlenderbotTokenizer
     :members: build_inputs_with_special_tokens
 
 
+BlenderbotTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotTokenizerFast
+    :members: build_inputs_with_special_tokens
+
+
 BlenderbotModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -398,6 +398,7 @@
     _import_structure["models.barthez"].append("BarthezTokenizerFast")
     _import_structure["models.bert"].append("BertTokenizerFast")
     _import_structure["models.big_bird"].append("BigBirdTokenizerFast")
+    _import_structure["models.blenderbot"].append("BlenderbotTokenizerFast")
     _import_structure["models.camembert"].append("CamembertTokenizerFast")
     _import_structure["models.deberta"].append("DebertaTokenizerFast")
     _import_structure["models.distilbert"].append("DistilBertTokenizerFast")
@@ -2284,6 +2285,7 @@
         from .models.barthez import BarthezTokenizerFast
         from .models.bert import BertTokenizerFast
         from .models.big_bird import BigBirdTokenizerFast
+        from .models.blenderbot import BlenderbotTokenizerFast
         from .models.blenderbot_small import BlenderbotSmallTokenizerFast
         from .models.camembert import CamembertTokenizerFast
         from .models.clip import CLIPTokenizerFast

diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
@@ -893,12 +893,42 @@ def converted(self) -> Tokenizer:
         return tokenizer
 
 
+class BlenderbotConverter(Converter):
+    def converted(self) -> Tokenizer:
+        ot = self.original_tokenizer
+        vocab = ot.encoder
+        merges = list(ot.bpe_ranks.keys())
+
+        tokenizer = Tokenizer(
+            BPE(
+                vocab=vocab,
+                merges=merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+            )
+        )
+
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space)
+        tokenizer.decoder = decoders.ByteLevel()
+        tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"$A:0 {ot.eos_token}:0",
+            special_tokens=[
+                (ot.eos_token, ot.eos_token_id),
+            ],
+        )
+
+        return tokenizer
+
+
 SLOW_TO_FAST_CONVERTERS = {
     "AlbertTokenizer": AlbertConverter,
     "BartTokenizer": RobertaConverter,
     "BarthezTokenizer": BarthezConverter,
     "BertTokenizer": BertConverter,
     "BigBirdTokenizer": BigBirdConverter,
+    "BlenderbotTokenizer": BlenderbotConverter,
     "CamembertTokenizer": CamembertConverter,
     "CLIPTokenizer": CLIPConverter,
     "ConvBertTokenizer": BertConverter,

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -108,7 +108,7 @@
             ),
             ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
             ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
-            ("blenderbot", ("BlenderbotTokenizer", None)),
+            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
             ("bart", ("BartTokenizer", "BartTokenizerFast")),
             ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
             ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),

diff --git a/src/transformers/models/blenderbot/__init__.py b/src/transformers/models/blenderbot/__init__.py
@@ -18,14 +18,17 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_torch_available
+from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
     "configuration_blenderbot": ["BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BlenderbotConfig"],
     "tokenization_blenderbot": ["BlenderbotTokenizer"],
 }
 
+if is_tokenizers_available():
+    _import_structure["tokenization_blenderbot_fast"] = ["BlenderbotTokenizerFast"]
+
 if is_torch_available():
     _import_structure["modeling_blenderbot"] = [
         "BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -48,6 +51,9 @@
     from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig
     from .tokenization_blenderbot import BlenderbotTokenizer
 
+    if is_tokenizers_available():
+        from .tokenization_blenderbot_fast import BlenderbotTokenizerFast
+
     if is_torch_available():
         from .modeling_blenderbot import (
             BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST,

diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 """Tokenization class for Blenderbot."""
 
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional
 
 from ...utils import logging
 from ..roberta.tokenization_roberta import RobertaTokenizer
@@ -58,7 +58,7 @@ class BlenderbotTokenizer(RobertaTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None):
+    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None):
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A Blenderbot sequence has the following format:

diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
@@ -0,0 +1,96 @@
+# coding=utf-8
+# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Tokenization class for Blenderbot."""
+
+from typing import TYPE_CHECKING, List, Optional
+
+from ...utils import logging
+from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast
+from .tokenization_blenderbot import BlenderbotTokenizer
+
+
+if TYPE_CHECKING:
+    from transformers.pipelines.conversational import Conversation
+
+logger = logging.get_logger(__name__)
+
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+    "tokenizer_config_file": "tokenizer_config.json",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
+    "merges_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
+    "tokenizer_config_file": {
+        "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}
+
+
+class BlenderbotTokenizerFast(RobertaTokenizerFast):
+    r"""
+    Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's `tokenizers` library).
+
+    :class:`~transformers.BlenderbotFast` is nearly identical to :class:`~transformers.RobertaTokenizerFast` and runs
+    end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
+    to the beginning of sequences.
+
+    Refer to superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning
+    parameters.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    slow_tokenizer_class = BlenderbotTokenizer
+
+    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A Blenderbot sequence has the following format:
+
+        - single sequence: `` X </s>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Will be ignored
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+        """
+        return token_ids_0 + [self.eos_token_id]
+
+    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
+        inputs = []
+        for is_user, text in conversation.iter_texts():
+            if is_user:
+                # We need to space prefix as it's being done within blenderbot
+                inputs.append(" " + text)
+            else:
+                # Generated responses should contain them already.
+                inputs.append(text)
+
+        full_string = "  ".join(inputs)
+        input_ids = self.encode(full_string)
+        if len(input_ids) > self.model_max_length:
+            input_ids = input_ids[-self.model_max_length :]
+            logger.warning(f"Trimmed input from conversation as it was longer than {self.model_max_length} tokens.")
+        return input_ids
diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py
@@ -47,6 +47,15 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["tokenizers"])
 
 
+class BlenderbotTokenizerFast:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tokenizers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["tokenizers"])
+
+
 class BlenderbotSmallTokenizerFast:
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["tokenizers"])

diff --git a/tests/test_modeling_blenderbot.py b/tests/test_modeling_blenderbot.py
@@ -137,6 +137,11 @@ def get_config(self):
             pad_token_id=self.pad_token_id,
         )
 
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.max_position_embeddings = 100
+        return config
+
     def prepare_config_and_inputs_for_common(self):
         config, inputs_dict = self.prepare_config_and_inputs()
         return config, inputs_dict

diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py
@@ -124,6 +124,11 @@ def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class, feature_extra
             def test(self):
                 if ModelClass.__name__.endswith("ForCausalLM"):
                     tiny_config.is_encoder_decoder = False
+                    if hasattr(tiny_config, "encoder_no_repeat_ngram_size"):
+                        # specific for blenderbot which supports both decoder-only
+                        # encoder/decoder but the test config  only reflects
+                        # encoder/decoder arch
+                        tiny_config.encoder_no_repeat_ngram_size = 0
                 if ModelClass.__name__.endswith("WithLMHead"):
                     tiny_config.is_decoder = True
                 try:

diff --git a/tests/test_tokenization_blenderbot.py b/tests/test_tokenization_blenderbot.py
@@ -16,22 +16,37 @@
 """Tests for Blenderbot Tokenizers, including common tests for BlenderbotSmallTokenizer."""
 import unittest
 
+from transformers import BlenderbotTokenizer, BlenderbotTokenizerFast
 from transformers.file_utils import cached_property
-from transformers.models.blenderbot.tokenization_blenderbot import BlenderbotTokenizer
 
 
 class Blenderbot3BTokenizerTests(unittest.TestCase):
     @cached_property
     def tokenizer_3b(self):
         return BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
 
+    @cached_property
+    def rust_tokenizer_3b(self):
+        return BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")
+
     def test_encode_decode_cycle(self):
         tok = self.tokenizer_3b
         src_text = " I am a small frog."
         encoded = tok([src_text], padding=False, truncation=False)["input_ids"]
         decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         assert src_text == decoded
 
+    def test_encode_decode_cycle_rust_tokenizer(self):
+        tok = self.rust_tokenizer_3b
+        src_text = " I am a small frog."
+        encoded = tok([src_text], padding=False, truncation=False)["input_ids"]
+        decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        assert src_text == decoded
+
     def test_3B_tokenization_same_as_parlai(self):
         assert self.tokenizer_3b.add_prefix_space
         assert self.tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]]
+
+    def test_3B_tokenization_same_as_parlai_rust_tokenizer(self):
+        assert self.rust_tokenizer_3b.add_prefix_space
+        assert self.rust_tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]]