In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize('''I am going to tokenize this sentence.''')
print(tokens)

['i', 'am', 'going', 'to', 'token', '##ize', 'this', 'sentence', '.']


In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('albert-base-v1')
tokens = tokenizer.tokenize('''I am going to tokenize this sentence
                            using albert-base-v1 model's tokenizer.''')
print(tokens)

['▁i', '▁am', '▁going', '▁to', '▁to', 'ken', 'ize', '▁this', '▁sentence', '▁using', '▁albert', '-', 'base', '-', 'v', '1', '▁model', "'", 's', '▁to', 'ken', 'izer', '.']


In [4]:
'''Note that above, the subword based tokenizer 
is used in both tokenizers. 
However to depict the beginning of a word in albert
base tokenizer it uses the prefix _
while, bert base tokenizer uses ##.'''


'Note that above, the subword based tokenizer \nis used in both tokenizers. \nHowever to depict the beginning of a word in albert\nbase tokenizer it uses the prefix _\nwhile, bert base tokenizer uses ##.'

In [5]:
'''
The second task of the tokenization pipeline
is to map those tokens to their respective IDs.
This is done by the convert_tokens_to_ids method.
'''

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize('Lets try to tokenize')
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)

'''This is why we need to download a file while instantiating
the tokenizer from pretrained method.
We have to make sure we use the same mapping as to when the model
was pretrained
To do this we use the convert tokens to ids method.'''

[11082, 3046, 2000, 19204, 4697]


In [6]:
'''
You may have noticed that the tokens for CLS and SEP 
are missing! Those are the special tokens that are added 
by the prepare for model method. 
The prepare for model method knows which special tokens
to add and where to add them based on the model type.
'''
final_inputs = tokenizer.prepare_for_model(input_ids)
print(final_inputs['input_ids'])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[101, 11082, 3046, 2000, 19204, 4697, 102]


In [8]:
'''
You can look at the special tokens modularly
by decoding the input ids as how the tokenizer 
has changed your text by using the decode method.
'''
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer('Lets try to tokenize')
print(tokenizer.decode(inputs['input_ids']))


[CLS] lets try to tokenize [SEP]


In [11]:
'''Above the bert tokenizer uses the [CLS] and [SEP] tokens
But the roberta tokenizer uses the <s> and </s> tokens'''
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('roberta-base')
inputs = tokenizer('Lets try to tokenize.')
print(tokenizer.decode(inputs['input_ids']))

<s>Lets try to tokenize.</s>


There are actually two ways to decode:
- The one shown above
- The one shown below

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize('Convert this to tokens.')
input_ids = tokenizer.convert_tokens_to_ids(tokens)
inputs = tokenizer.prepare_for_model(input_ids)
print(inputs)

# decode:
decode = tokenizer.decode(input_ids)
print(decode)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [101, 10463, 2023, 2000, 19204, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
convert this to tokens.


In [None]:
from transformers import AutoTokenizer

tokenizer =AutoTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer('Convert this to tokens.')
print(inputs)
decode = tokenizer.decode(inputs['input_ids'])
print(decode)

{'input_ids': [101, 10463, 2023, 2000, 19204, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] convert this to tokens. [SEP]


In [12]:
'''
Now that you know the intermediate stuff about how 
a tokenizer works, you can forget all that stuff
and only remember that you have to call it on the input 
text.
'''
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer('Lets try to tokenize')
print(inputs)

{'input_ids': [101, 11082, 3046, 2000, 19204, 4697, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [13]:
'''To learn what attention mask is
check out the --batch input together

To learn what the token type ids are
check out --process pairs of sentences  
'''

'To learn what attention mask is\ncheck out the --batch input together\n\nTo learn what the token type ids are\ncheck out --process pairs of sentences  \n'

The primary and easy way to batch inputs together is as follows:

In [2]:
from transformers import AutoTokenizer

checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = ['I have been waiting for a hugging face course my whole life.',
             'I hate this so much',
             'I am not confident.']
tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
print(tokens)

{'input_ids': tensor([[  101,  1045,  2031,  2042,  3403,  2005,  1037, 17662,  2227,  2607,
          2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   102,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1045,  2572,  2025,  9657,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


However, if you wish to batch inputs together from beneath the tokenizer pipeline:

In [4]:
from transformers import AutoTokenizer

checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = ['I have been waiting for a hugging face course my whole life.',
             'I hate this so much',
             'I am not confident.']

tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]

print(ids)
print(ids[0])
print(ids[1])
print(ids[2])

[[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012], [1045, 5223, 2023, 2061, 2172], [1045, 2572, 2025, 9657, 1012]]
[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012]
[1045, 5223, 2023, 2061, 2172]
[1045, 2572, 2025, 9657, 1012]


In [5]:
'''Trying to create a tensor from the three lists 
in torch or tensorflow will result in an error. This
is because the tensors must be of the same size, i.e. rectangular
This is done by padding. which we will see later on.'''


import torch
print(ids)
input_ids = torch.tensor(ids)

[[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012], [1045, 5223, 2023, 2061, 2172], [1045, 2572, 2025, 9657, 1012]]


ValueError: expected sequence of length 13 at dim 1 (got 5)

In [7]:
'''The padding id provided to the model via training is 0.
One should not try to change it. you can pad your outputs like so:'''

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
pad = tokenizer.pad_token_id
print(pad)

0


In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

sentences = ['I have been waiting for a hugging face course my whole life.',
             'I hate this so much',
             'I am not confident.']

tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]

print(ids)
print(ids[0])
print(ids[1])
print(ids[2])

pad_ids = [tokenizer.pad_token_id(_) for _ in ids]
print(pad_ids)
ids1 = torch.tensor(pad_ids[0])
ids2 = torch.tensor(pad_ids[1])
ids3 = torch.tensor(pad_ids[2])
all_ids = torch.tensor(ids1, ids2, ids3)

[[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012], [1045, 5223, 2023, 2061, 2172], [1045, 2572, 2025, 9657, 1012]]
[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012]
[1045, 5223, 2023, 2061, 2172]
[1045, 2572, 2025, 9657, 1012]


TypeError: 'int' object is not callable

In [1]:
'''More methods!!!'''
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize('This is me practicing')
print(tokens)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)
tokens_2 = tokenizer.convert_ids_to_tokens(input_ids)
print(tokens_2)
strings = tokenizer.convert_tokens_to_string(tokens)
print(strings)

['this', 'is', 'me', 'practicing']
[2023, 2003, 2033, 12560]
['this', 'is', 'me', 'practicing']
this is me practicing
