[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1FOuOnpnnFRxTChVkei2oE1xHTV5fK7ma?usp=sharing)

# Tokenizers

In [13]:
!pip install transformers[sentencepiece] 

In [14]:
from transformers import AutoTokenizer

# Choose the pretrained checkpoint whose tokenizer should be loaded.
# AutoTokenizer.from_pretrained builds the tokenizer that matches this checkpoint name.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [15]:
# Preview a handful of vocabulary entries as (token, id) pairs instead of printing
# the whole vocabulary, which would flood the notebook output.
# get_vocab() is the documented accessor for the token -> id mapping; the order of
# the returned entries is not guaranteed, so treat this as an arbitrary sample.
print(list(tokenizer.get_vocab().items())[:10])



In [16]:
# Report how many entries the tokenizer's vocabulary contains.
print(f'The vocabulary size is {len(tokenizer.vocab)}')

The vocabulary size is 30522


In [31]:
# Walk a single sentence through the tokenizer.
# - This uncased BERT tokenizer lower-cases the text before splitting it into word pieces.
# - encode() additionally wraps the sequence in special tokens:
#     [CLS] marks the start and is used for sentence-level classification;
#     [SEP] separates the sentences in two-sentence tasks (e.g. deciding whether
#     two sentences are connected) and closes the sequence.
sentence = 'I like NLP'
tokens = tokenizer.tokenize(sentence)  # word-piece strings, without special tokens
ids = tokenizer.encode(sentence)       # vocabulary ids, including [CLS] and [SEP]

print(sentence)
print("The tokens are:", tokens)
print("The tokens have the following IDs in the vocabulary", ids)
# Decoding the ids gives back the lower-cased text with the special tokens visible.
print(tokenizer.decode(ids))

I like NLP
The tokens are: ['i', 'like', 'nl', '##p']
The tokens have the following IDs in the vocabulary [101, 1045, 2066, 17953, 2361, 102]
[CLS] i like nlp [SEP]


In [32]:
# Show each special token next to its id in the vocabulary, one per line.
print(
    f'{tokenizer.cls_token} -> {tokenizer.cls_token_id}',
    f'{tokenizer.sep_token} -> {tokenizer.sep_token_id}',
    sep='\n',
)

[CLS] -> 101
[SEP] -> 102


In [33]:
# BERT does not have emojis in its vocabulary.
'😀' in tokenizer.vocab

False

In [34]:
# "NLP" and the emoji are not separated by a space, so they are treated as a single word.
# That word cannot be built from BERT's vocabulary, so the whole word becomes [UNK].
sentence = 'I like NLP😀'
tokenizer.tokenize(sentence)

['i', 'like', '[UNK]']

In [35]:
# Calling the tokenizer with two texts encodes them as ONE sentence pair:
#     [CLS] first sentence [SEP] second sentence [SEP]
# The call returns a dict-like object; with return_tensors='pt' its values are
# PyTorch tensors that can be fed to the model:
#   - input_ids:      the vocabulary ids of the tokens of both sentences.
#   - token_type_ids: which sentence each token belongs to
#                     (0 = first sentence, 1 = second sentence).
#   - attention_mask: which positions the model should attend to; here every
#                     entry is 1, so the model attends to all tokens.
#
# NOTE: the name `input` shadows Python's built-in input(); it is kept because
# the following cells read the encoding through this name.
first_sentence = 'I like NLP.'
second_sentence = 'What about you?'
input = tokenizer(
    text=first_sentence,
    text_pair=second_sentence,
    return_tensors='pt',
)
input

{'input_ids': tensor([[  101,  1045,  2066, 17953,  2361,  1012,   102,  2054,  2055,  2017,
          1029,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [37]:
# Display only the input_ids (the vocabulary ids of the encoded sentence pair)
input['input_ids']

tensor([[  101,  1045,  2066, 17953,  2361,  1012,   102,  2054,  2055,  2017,
          1029,   102]])

In [38]:
# Display only the token_type_ids (0 = first sentence, 1 = second sentence)
input['token_type_ids']

tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]])

In [39]:
# Display only the attention_mask (1 = position the model should attend to)
input['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [40]:
# Encode two independent sentences as a batch (a list of texts), not as a sentence pair.
# The sentences have different lengths, so padding=True pads the shorter one until
# both rows have the same length and can be stacked into one tensor.
#
# The attention_mask has the same shape as input_ids and contains only 0s and 1s:
#   1 -> a real token the model should attend to
#   0 -> a padding position the model should ignore
# The shorter first sentence therefore ends in 0s, while the longer one is all 1s.
first_sentence = 'I like NLP.'
second_sentence = 'What are your thoughts on the subject?'
input = tokenizer(
    [first_sentence, second_sentence],
    padding=True,
    return_tensors='pt',
)
input['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])