### Word-based tokenizers (Split by space)


In [30]:
tokenized_text = "I am Wong Fu Lim".split()
print(tokenized_text)

['I', 'am', 'Wong', 'Fu', 'Lim']


### Load a tokenizers


In [31]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [32]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

### Use a tokenizers


In [33]:
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

### Save a tokenizers


In [34]:
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

### Create tokens


Notice: ##former is a convention for Word-Piece tokenizers


In [35]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


sequence = "Using a Transformer network is simple"

tokens = tokenizer.tokenize(sequence)

print(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


Notice: _using a tokenizers_ is a convention for sentence piece tokenizers


In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("albert-base-v1")


sequence = "Using a Transformer network is simple"

tokens = tokenizer.tokenize(sequence)

print(tokens)

['▁using', '▁a', '▁transform', 'er', '▁network', '▁is', '▁simple']


### Convert tokens to inputs IDs


In [36]:
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


### Convert tokens to inputs IDs with added (CLS and SEP) tokens


In [None]:
final_inputs = tokenizer.prepare_for_model(ids)
print(final_inputs["input_ids"])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102]


### Decode inputs IDs to tokens


In [None]:
decoded_string = tokenizer.decode(
    [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102])
print(decoded_string)

[CLS] Using a Transformer network is simple [SEP]


### Difference between WordPiece and SentencePiece tokenizers


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
inputs = tokenizer("Let's try to tokenize!")

print(inputs)

print(tokenizer.decode(inputs["input_ids"]))

{'input_ids': [101, 2421, 112, 188, 2222, 1106, 22559, 3708, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] Let's try to tokenize! [SEP]


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
inputs = tokenizer("Let's try to tokenize!")

print(inputs)

print(tokenizer.decode(inputs["input_ids"]))

{'input_ids': [0, 7939, 18, 860, 7, 19233, 2072, 328, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
<s>Let's try to tokenize!</s>


# Exercise


Replicate the two last steps (tokenization and conversion to input IDs) on the input sentences (bert-base-cased)

raw_inputs = [
"I've been waiting for a HuggingFace course my whole life.",
"I hate this so much!",
]

inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

print(inputs)

{
'input_ids': tensor([
[ 101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102],
[ 101, 1045, 5223, 2023, 2061, 2172, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0]
]),
'attention_mask': tensor([
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
])
}


In [1]:
from transformers import AutoTokenizer
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokens = tokenizer.tokenize(raw_inputs)
ids = tokenizer.convert_tokens_to_ids(tokens)
final_ids = tokenizer.prepare_for_model(ids)
print(tokens)
print(ids)
print(final_ids['input_ids'])

  from .autonotebook import tqdm as notebook_tqdm
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


['I', "'", 've', 'been', 'waiting', 'for', 'a', 'Hu', '##gging', '##F', '##ace', 'course', 'my', 'whole', 'life', '.', 'I', 'hate', 'this', 'so', 'much', '!']
[146, 112, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 146, 4819, 1142, 1177, 1277, 106]
[101, 146, 112, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 146, 4819, 1142, 1177, 1277, 106, 102]
