In [3]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the chosen checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Calling the tokenizer object directly encodes the sentence in a single call;
# the cell displays only the resulting list of input IDs.
sentence = "Let's try to tokenize!"
inputs = tokenizer(sentence)
inputs["input_ids"]

[101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102]

## What happens inside the AutoTokenizer API pipeline?

![Diagram of the tokenization pipeline](image-4.png)

### Step 1: Split the text into tokens

In [4]:
from transformers import AutoTokenizer

# Step 1 on its own: split the raw sentence into string tokens (no IDs yet).
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentence = "Let's try to tokenize!"
tokens = tokenizer.tokenize(sentence)
tokens

['let', "'", 's', 'try', 'to', 'token', '##ize', '!']

In [6]:
from transformers import AutoTokenizer

# Same split step with the ALBERT checkpoint. In the output below, tokens that
# follow a space (or start the text) carry a leading "▁" marker instead of a
# plain underscore.
checkpoint = "albert-base-v1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentence = "Let's try to tokenize!"
tokens = tokenizer.tokenize(sentence)
tokens

['▁let', "'", 's', '▁try', '▁to', '▁to', 'ken', 'ize', '!']

### Step 2: Convert tokens to IDs

In [7]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Repeat step 1 so this cell can run on its own.
sentence = "Let's try to tokenize!"
tokens = tokenizer.tokenize(sentence)

# Step 2: map every token string to its integer ID. The result holds one ID
# per token; the surrounding 101 / 102 IDs are not present at this stage.
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids

[2292, 1005, 1055, 3046, 2000, 19204, 4697, 999]

### Step 3: Add special tokens to build the final model inputs

In [8]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Steps 1 and 2 again, so the cell does not rely on state from earlier cells.
sentence = "Let's try to tokenize!"
tokens = tokenizer.tokenize(sentence)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Step 3: wrap the raw IDs into model-ready inputs. The displayed list gains a
# leading 101 and a trailing 102 compared with the step-2 IDs.
# With this (fast) tokenizer the call also prints an advisory message
# recommending the `__call__` method instead.
final_inputs = tokenizer.prepare_for_model(input_ids)
final_inputs["input_ids"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102]

In [10]:
# Decoding: turn input IDs back into text

from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Encode the sentence, then feed the IDs straight back through decode().
# The decoded string shows the special tokens as [CLS] and [SEP].
sentence = "Let's try to tokenize!"
inputs = tokenizer(sentence)
tokenizer.decode(inputs["input_ids"])

"[CLS] let's try to tokenize! [SEP]"

In [12]:
# Decoding with the RoBERTa tokenizer

from transformers import AutoTokenizer

checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Same encode/decode round trip as with BERT. Here the decoded string is
# wrapped in <s> ... </s> special tokens and keeps the original casing.
sentence = "Let's try to tokenize!"
inputs = tokenizer(sentence)
tokenizer.decode(inputs["input_ids"])

"<s>Let's try to tokenize!</s>"

In [None]:
# Done.