In [3]:
from transformers import AutoTokenizer

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    'I have been waiting for a hugging face course my whole life.',
    'I hate this so much!',
    'I love this so much!',
    'I am disappointed and my day is ruined'
]

tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
print(tokens)

{'input_ids': tensor([[  101,  1045,  2031,  2042,  3403,  2005,  1037, 17662,  2227,  2607,
          2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1045,  2293,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1045,  2572,  9364,  1998,  2026,  2154,  2003,  9868,   102,
             0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}


In [1]:
from transformers import AutoTokenizer

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    'I have been waiting for a hugging face course my whole life.',
    'I am very disappointed in you.',
    'I hate this so much!',
    'I am terrified as I am never confident about my skills.'
]

tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]

print(ids)
print(ids[0])
print(ids[1])
print(ids[2])
print(ids[3])

[[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012], [1045, 2572, 2200, 9364, 1999, 2017, 1012], [1045, 5223, 2023, 2061, 2172, 999], [1045, 2572, 10215, 2004, 1045, 2572, 2196, 9657, 2055, 2026, 4813, 1012]]
[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012]
[1045, 2572, 2200, 9364, 1999, 2017, 1012]
[1045, 5223, 2023, 2061, 2172, 999]
[1045, 2572, 10215, 2004, 1045, 2572, 2196, 9657, 2055, 2026, 4813, 1012]


In [2]:
final_inputs = tokenizer.prepare_for_model(ids)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [3]:
print(final_inputs)

{'input_ids': [101, [1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012], [1045, 2572, 2200, 9364, 1999, 2017, 1012], [1045, 5223, 2023, 2061, 2172, 999], [1045, 2572, 10215, 2004, 1045, 2572, 2196, 9657, 2055, 2026, 4813, 1012], 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [4]:
'''trying to create a tensor or numpy array from the list of inputs
will result in an error. This is because the list of inputs is not 
rectangular i.e they are not of equal dimensions'''
import torch
ids = ids
input_ids = torch.tensor(ids)


ValueError: expected sequence of length 13 at dim 1 (got 7)

In [5]:
'''Therefore, you have to pad, here we pad manually. But be sure to
check out dynamic padding which is almost always better on the CPU and
the GPU!'''


'Therefore, you have to pad, here we pad manually. But be sure to\ncheck out dynamic padding which is almost always better on the CPU and\nthe GPU!'

In [6]:
'''The value you use to pad the sentences should not be picked
randomly. Use tokenizer.pad_token_id to get the value of the pad token'''

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
print(tokenizer)
tokenizer.pad_token_id

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


0

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize('Convert this to tokens.')
input_ids = tokenizer.convert_tokens_to_ids(tokens)
inputs = tokenizer.prepare_for_model(input_ids)
print(inputs)

# decode:
decode = tokenizer.decode(input_ids)
print(decode)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [101, 10463, 2023, 2000, 19204, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
convert this to tokens.


In [11]:
from transformers import AutoTokenizer

tokenizer =AutoTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer('Convert this to tokens.')
print(inputs)
decode = tokenizer.decode(inputs['input_ids'])
print(decode)

{'input_ids': [101, 10463, 2023, 2000, 19204, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] convert this to tokens. [SEP]


In [4]:
'''Enough revision, now pad them:'''
from transformers import AutoTokenizer

sentences = ['This is a list of sentences',
             'I will try my best to keep it short.',
             'It is hard to learn like this.',
             'I am tired.']

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
input_ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]
inputs = tokenizer.prepare_for_model([input_ids])
print(inputs)



You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [101, [[2023, 2003, 1037, 2862, 1997, 11746], [1045, 2097, 3046, 2026, 2190, 2000, 2562, 2009, 2460, 1012], [2009, 2003, 2524, 2000, 4553, 2066, 2023, 1012], [1045, 2572, 5458, 1012]], 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}


In [7]:
for i in range(len(inputs)): print(inputs['input_ids'][i])

101
[[2023, 2003, 1037, 2862, 1997, 11746], [1045, 2097, 3046, 2026, 2190, 2000, 2562, 2009, 2460, 1012], [2009, 2003, 2524, 2000, 4553, 2066, 2023, 1012], [1045, 2572, 5458, 1012]]
102


In [11]:
inputs['input_ids'][1]

[[2023, 2003, 1037, 2862, 1997, 11746],
 [1045, 2097, 3046, 2026, 2190, 2000, 2562, 2009, 2460, 1012],
 [2009, 2003, 2524, 2000, 4553, 2066, 2023, 1012],
 [1045, 2572, 5458, 1012]]

In [12]:
inputs['input_ids'][1][0]

[2023, 2003, 1037, 2862, 1997, 11746]

In [13]:
inputs['input_ids'][1][1]

[1045, 2097, 3046, 2026, 2190, 2000, 2562, 2009, 2460, 1012]

In [14]:
inputs['input_ids'][1][2]

[2009, 2003, 2524, 2000, 4553, 2066, 2023, 1012]

In [15]:
inputs['input_ids'][1][3]

[1045, 2572, 5458, 1012]

In [17]:
'''Now we won't receive error.'''
import torch
padded_input_ids = [[1045, 2097, 3046, 2026, 2190, 2000, 2562, 2009, 2460, 1012],
                    [2023, 2003, 1037, 2862, 1997, 11746,   0,    0,    0,    0],
                    [2009, 2003, 2524, 2000, 4553, 2066, 2023, 1012,    0,    0],
                    [1045, 2572, 5458, 1012,    0,    0,    0,    0,    0,    0]]

padded_input_ids = torch.tensor(padded_input_ids)    