diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index 0296dad8bc0943..6caa93db57fda5 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -142,8 +142,13 @@ def __init__( sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs ) -> None: - # Mask token behave like a normal word, i.e. include the space before it - mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + # Mask token behave like a normal word, i.e. include the space before it and + # is included in the raw text, there should be a match in a non-normalized sentence. + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token + ) self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py index 60c9d3144f192a..9eb91ec555e77d 100644 --- a/src/transformers/models/albert/tokenization_albert_fast.py +++ b/src/transformers/models/albert/tokenization_albert_fast.py @@ -135,8 +135,13 @@ def __init__( mask_token="[MASK]", **kwargs ): - # Mask token behave like a normal word, i.e. include the space before it - mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + # Mask token behave like a normal word, i.e. include the space before it and + # is included in the raw text, there should be a match in a non-normalized sentence. + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token + ) super().__init__( vocab_file,