Fix fast tokenization problems (huggingface#13930)
* Fix albert mask token tokenization.

* Ensure special tokens are sanitized.

* Style

* Fix

* Apply suggestions from code review
qqaatw authored and Alberto Bégué committed Jan 27, 2022
1 parent 15263b2 commit 3da4207
Showing 2 changed files with 14 additions and 4 deletions.
9 changes: 7 additions & 2 deletions src/transformers/models/albert/tokenization_albert.py
@@ -142,8 +142,13 @@ def __init__(
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs
) -> None:
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        # Mask token behaves like a normal word, i.e. it includes the space before it.
+        # Since it appears in the raw text, there should be a match in a non-normalized sentence.
+        mask_token = (
+            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
+            if isinstance(mask_token, str)
+            else mask_token
+        )

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

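For reference, a minimal sketch of the `AddedToken` configuration this hunk introduces; the flag explanations paraphrase the tokenizers library's documented semantics:

```python
from transformers import AddedToken

# lstrip=True:      absorb the space to the left of "[MASK]", so the token
#                   behaves like a normal word preceded by a space
# rstrip=False:     leave the text to the right of the token untouched
# normalized=False: match the token against the raw input, before the
#                   tokenizer's normalizer runs; ALBERT lowercases during
#                   normalization, which would otherwise turn "[MASK]" into
#                   "[mask]" and defeat the match
mask_token = AddedToken("[MASK]", lstrip=True, rstrip=False, normalized=False)
```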
9 changes: 7 additions & 2 deletions src/transformers/models/albert/tokenization_albert_fast.py
@@ -135,8 +135,13 @@ def __init__(
mask_token="[MASK]",
**kwargs
):
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        # Mask token behaves like a normal word, i.e. it includes the space before it.
+        # Since it appears in the raw text, there should be a match in a non-normalized sentence.
+        mask_token = (
+            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
+            if isinstance(mask_token, str)
+            else mask_token
+        )

super().__init__(
vocab_file,
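A quick check of the fix, sketched below with the public albert-base-v2 checkpoint (chosen here purely for illustration). As the new comment suggests, the failure mode was that special-token matching happened against the normalized (lowercased) text, so "[MASK]" could be split into subword pieces; with normalized=False the token is matched in the raw sentence and should encode as a single id:

```python
from transformers import AlbertTokenizerFast

tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v2")
ids = tokenizer("The capital of France is [MASK].")["input_ids"]

# With the fix, "[MASK]" survives normalization and maps to the single
# mask_token_id instead of being lowercased and split apart.
assert tokenizer.mask_token_id in ids
```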
