diff --git a/docs/source/index.rst b/docs/source/index.rst index fb945683025a2..6ea06fd6242e7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -379,7 +379,7 @@ Flax), PyTorch, and/or TensorFlow. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Blenderbot | ✅ | ❌ | ✅ | ✅ | ❌ | +| Blenderbot | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ diff --git a/docs/source/model_doc/blenderbot.rst b/docs/source/model_doc/blenderbot.rst index fbed715cb6f0f..179058153f89c 100644 --- a/docs/source/model_doc/blenderbot.rst +++ b/docs/source/model_doc/blenderbot.rst @@ -81,6 +81,13 @@ BlenderbotTokenizer :members: build_inputs_with_special_tokens +BlenderbotTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BlenderbotTokenizerFast + :members: build_inputs_with_special_tokens + + BlenderbotModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index bf163b0f25b28..305e464bd7c90 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -398,6 +398,7 @@ _import_structure["models.barthez"].append("BarthezTokenizerFast") _import_structure["models.bert"].append("BertTokenizerFast") _import_structure["models.big_bird"].append("BigBirdTokenizerFast") + _import_structure["models.blenderbot"].append("BlenderbotTokenizerFast") _import_structure["models.camembert"].append("CamembertTokenizerFast") _import_structure["models.deberta"].append("DebertaTokenizerFast") _import_structure["models.distilbert"].append("DistilBertTokenizerFast") @@ -2284,6 +2285,7 @@ from .models.barthez import BarthezTokenizerFast from .models.bert import BertTokenizerFast from .models.big_bird import BigBirdTokenizerFast + from .models.blenderbot import BlenderbotTokenizerFast from .models.blenderbot_small import BlenderbotSmallTokenizerFast from .models.camembert import CamembertTokenizerFast from .models.clip import CLIPTokenizerFast diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 4dd528c55ed4b..a66015c260d9f 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -893,12 +893,42 @@ def converted(self) -> Tokenizer: return tokenizer +class BlenderbotConverter(Converter): + def converted(self) -> Tokenizer: + ot = self.original_tokenizer + vocab = ot.encoder + merges = list(ot.bpe_ranks.keys()) + + tokenizer = Tokenizer( + BPE( + vocab=vocab, + merges=merges, + dropout=None, + continuing_subword_prefix="", + end_of_word_suffix="", + fuse_unk=False, + ) + ) + + tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space) + tokenizer.decoder = decoders.ByteLevel() + tokenizer.post_processor = processors.TemplateProcessing( + single=f"$A:0 {ot.eos_token}:0", + special_tokens=[ + (ot.eos_token, ot.eos_token_id), + ], + ) + + return 
tokenizer + + SLOW_TO_FAST_CONVERTERS = { "AlbertTokenizer": AlbertConverter, "BartTokenizer": RobertaConverter, "BarthezTokenizer": BarthezConverter, "BertTokenizer": BertConverter, "BigBirdTokenizer": BigBirdConverter, + "BlenderbotTokenizer": BlenderbotConverter, "CamembertTokenizer": CamembertConverter, "CLIPTokenizer": CLIPConverter, "ConvBertTokenizer": BertConverter, diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 38fb07ac189c0..c55206592ca17 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -108,7 +108,7 @@ ), ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), ("blenderbot-small", ("BlenderbotSmallTokenizer", None)), - ("blenderbot", ("BlenderbotTokenizer", None)), + ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")), ("bart", ("BartTokenizer", "BartTokenizerFast")), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/blenderbot/__init__.py b/src/transformers/models/blenderbot/__init__.py index 372dcab23f151..2eea035535efb 100644 --- a/src/transformers/models/blenderbot/__init__.py +++ b/src/transformers/models/blenderbot/__init__.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING -from ...file_utils import _LazyModule, is_tf_available, is_torch_available +from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available _import_structure = { @@ -26,6 +26,9 @@ "tokenization_blenderbot": ["BlenderbotTokenizer"], } +if is_tokenizers_available(): + _import_structure["tokenization_blenderbot_fast"] = ["BlenderbotTokenizerFast"] + if is_torch_available(): _import_structure["modeling_blenderbot"] = [ "BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -48,6 +51,9 @@ from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig from .tokenization_blenderbot import BlenderbotTokenizer + if is_tokenizers_available(): + from .tokenization_blenderbot_fast import BlenderbotTokenizerFast + if is_torch_available(): from .modeling_blenderbot import ( BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index b37039ee127ef..e003d8053427e 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -14,7 +14,7 @@ # limitations under the License. """Tokenization class for Blenderbot.""" -from typing import TYPE_CHECKING, List +from typing import TYPE_CHECKING, List, Optional from ...utils import logging from ..roberta.tokenization_roberta import RobertaTokenizer @@ -58,7 +58,7 @@ class BlenderbotTokenizer(RobertaTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None): + def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None): """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. 
A Blenderbot sequence has the following format:
diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
new file mode 100644
index 0000000000000..f7835d573c563
--- /dev/null
+++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
@@ -0,0 +1,96 @@
+# coding=utf-8
+# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Tokenization class for Blenderbot."""
+
+from typing import TYPE_CHECKING, List, Optional
+
+from ...utils import logging
+from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast
+from .tokenization_blenderbot import BlenderbotTokenizer
+
+
+if TYPE_CHECKING:
+    from transformers.pipelines.conversational import Conversation
+
+logger = logging.get_logger(__name__)
+
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+    "tokenizer_config_file": "tokenizer_config.json",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
+    "merges_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
+    "tokenizer_config_file": {
+        "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}
+
+
+class BlenderbotTokenizerFast(RobertaTokenizerFast):
+    r"""
+    Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's `tokenizers` library).
+
+    :class:`~transformers.BlenderbotTokenizerFast` is nearly identical to :class:`~transformers.RobertaTokenizerFast`
+    and runs end-to-end tokenization based on byte-level Byte-Pair-Encoding. The only difference is that it doesn't
+    add a BOS token to the beginning of sequences.
+
+    Refer to superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning
+    parameters.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    slow_tokenizer_class = BlenderbotTokenizer
+
+    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None):
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+        and adding special tokens. A Blenderbot sequence has the following format:
+
+        - single sequence: `` X </s>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Will be ignored
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+        """
+        return token_ids_0 + [self.eos_token_id]
+
+    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
+        inputs = []
+        for is_user, text in conversation.iter_texts():
+            if is_user:
+                # We need to add a space prefix as it's being done within blenderbot
+                inputs.append(" " + text)
+            else:
+                # Generated responses should already contain the space prefix.
+                inputs.append(text)
+
+        full_string = " ".join(inputs)
+        input_ids = self.encode(full_string)
+        if len(input_ids) > self.model_max_length:
+            input_ids = input_ids[-self.model_max_length :]
+            logger.warning(f"Trimmed input from conversation as it was longer than {self.model_max_length} tokens.")
+        return input_ids
diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py
index eb79f72d70724..c5887c55d9160 100644
--- a/src/transformers/utils/dummy_tokenizers_objects.py
+++ b/src/transformers/utils/dummy_tokenizers_objects.py
@@ -47,6 +47,15 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["tokenizers"])
 
 
+class BlenderbotTokenizerFast:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tokenizers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["tokenizers"])
+
+
 class BlenderbotSmallTokenizerFast:
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["tokenizers"])
diff --git a/tests/test_modeling_blenderbot.py b/tests/test_modeling_blenderbot.py
index 33d5064924d20..9e04ec89d99d9 100644
--- a/tests/test_modeling_blenderbot.py
+++ b/tests/test_modeling_blenderbot.py
@@ -137,6 +137,11 @@ def get_config(self):
             pad_token_id=self.pad_token_id,
         )
 
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.max_position_embeddings = 100
+        return config
+
     def prepare_config_and_inputs_for_common(self):
         config, inputs_dict = self.prepare_config_and_inputs()
         return config, inputs_dict
diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py
index e64d4b8c09f9f..5cb9fb6ecb550 100644
--- a/tests/test_pipelines_common.py
+++ b/tests/test_pipelines_common.py
@@ -124,6 +124,11 @@ def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class, feature_extra
     def test(self):
         if ModelClass.__name__.endswith("ForCausalLM"):
             tiny_config.is_encoder_decoder = False
+            if hasattr(tiny_config, "encoder_no_repeat_ngram_size"):
+                # specific to blenderbot, which supports both decoder-only and
+                # encoder/decoder architectures, but the test config only
+                # reflects the encoder/decoder arch
+                tiny_config.encoder_no_repeat_ngram_size = 0
         if ModelClass.__name__.endswith("WithLMHead"):
             tiny_config.is_decoder = True
         try:
diff --git a/tests/test_tokenization_blenderbot.py b/tests/test_tokenization_blenderbot.py
index 6cb4eacfb4b8b..93f29634ef302 100644
--- a/tests/test_tokenization_blenderbot.py
+++ b/tests/test_tokenization_blenderbot.py
@@ -16,8 +16,8 @@
 """Tests for Blenderbot Tokenizers, including common tests for BlenderbotSmallTokenizer."""
 import unittest
 
+from transformers import BlenderbotTokenizer, BlenderbotTokenizerFast
 from transformers.file_utils import cached_property
-from transformers.models.blenderbot.tokenization_blenderbot import BlenderbotTokenizer
 
 
 class Blenderbot3BTokenizerTests(unittest.TestCase):
@@ -25,6 +25,10 @@ class Blenderbot3BTokenizerTests(unittest.TestCase):
     def tokenizer_3b(self):
         return BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
 
+    @cached_property
+    def rust_tokenizer_3b(self):
+        return BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")
+
def test_encode_decode_cycle(self): tok = self.tokenizer_3b src_text = " I am a small frog." @@ -32,6 +36,17 @@ def test_encode_decode_cycle(self): decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] assert src_text == decoded + def test_encode_decode_cycle_rust_tokenizer(self): + tok = self.rust_tokenizer_3b + src_text = " I am a small frog." + encoded = tok([src_text], padding=False, truncation=False)["input_ids"] + decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + assert src_text == decoded + def test_3B_tokenization_same_as_parlai(self): assert self.tokenizer_3b.add_prefix_space assert self.tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]] + + def test_3B_tokenization_same_as_parlai_rust_tokenizer(self): + assert self.rust_tokenizer_3b.add_prefix_space + assert self.rust_tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]]
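For reviewers who want to try the new fast tokenizer locally, the snippet below mirrors the parity assertions in the tests above. It is a sketch rather than part of the patch: it assumes `tokenizers` is installed and that the `facebook/blenderbot-3B` vocabulary files can be downloaded.

```python
from transformers import BlenderbotTokenizer, BlenderbotTokenizerFast

# Load the slow (Python) and fast (Rust-backed) tokenizers from the same checkpoint;
# the fast one is built from vocab.json/merges.txt through the new BlenderbotConverter.
slow = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
fast = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")

text = " I am a small frog."

# Neither tokenizer prepends a BOS token; only the EOS (</s>) is appended,
# so the two encodings should agree token for token.
assert slow(text).input_ids == fast(text).input_ids

# Round trip: decoding with skip_special_tokens=True drops the trailing </s>
# and recovers the input, including the leading space kept by add_prefix_space.
ids = fast([text], padding=False, truncation=False)["input_ids"]
decoded = fast.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
assert decoded == text
```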