### 12.1. BERT for Tokenizing

In [None]:
! pip install transformers



In [None]:
from transformers import BertTokenizer

In [None]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_sent_length = 25
sample_sent = 'Hello, are you judgemental. No I am incremental.'



In [None]:
sample_sent_plus_special_tokens = '[CLS]' + sample_sent + '[SEP]'
tokenized_sent = bert_tokenizer.tokenize(sample_sent_plus_special_tokens)
print('tokenized_sent', tokenized_sent)

tokenized_sent ['[CLS]', 'hello', ',', 'are', 'you', 'judgement', '##al', '.', 'no', 'i', 'am', 'inc', '##rem', '##ental', '.', '[SEP]']


In [None]:
input_ids = bert_tokenizer.convert_tokens_to_ids(tokenized_sent)
print(input_ids)

[101, 7592, 1010, 2024, 2017, 16646, 2389, 1012, 2053, 1045, 2572, 4297, 28578, 21050, 1012, 102]


In [None]:
pad_length = max_sent_length - len(input_ids)
print(pad_length)

9


In [None]:
input_ids = input_ids + ([0] * pad_length)
print(input_ids)

[101, 7592, 1010, 2024, 2017, 16646, 2389, 1012, 2053, 1045, 2572, 4297, 28578, 21050, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
att_mask = [1] * len(input_ids)
att_mask = att_mask+ ([0] * pad_length)
print(att_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
token_type_ids = [0] * max_sent_length


In [None]:
input_for_bert = {
    "token_ids": input_ids,
    "token_type_ids": token_type_ids,
    "attention_mask": att_mask
} 
print(input_for_bert )

{'token_ids': [101, 7592, 1010, 2024, 2017, 16646, 2389, 1012, 2053, 1045, 2572, 4297, 28578, 21050, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [None]:
input_for_bert= bert_tokenizer.encode_plus(
                        sample_sent,                      
                        add_special_tokens = True, 
                        max_length = max_sent_length, 
                        pad_to_max_length = True, 
                        return_attention_mask = True,
              )
print('encoded', input_for_bert)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


encoded {'input_ids': [101, 7592, 1010, 2024, 2017, 16646, 2389, 1012, 2053, 1045, 2572, 4297, 28578, 21050, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
