In [1]:
from transformers import AutoTokenizer

In [3]:
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
sentence = 'I am learning LLms and right now Transformers are on my radar'

In [5]:
model_inputs = tokenizer(sentence)

In [6]:
model_inputs

{'input_ids': [101, 1045, 2572, 4083, 2222, 5244, 1998, 2157, 2085, 19081, 2024, 2006, 2026, 7217, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
sequences = ['This has been a good day', 'this is the second sequence']

In [8]:
model_inputs = tokenizer(sequences)

In [9]:
model_inputs

{'input_ids': [[101, 2023, 2038, 2042, 1037, 2204, 2154, 102], [101, 2023, 2003, 1996, 2117, 5537, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}

In [10]:
# We cam add paddings too with tokenizers
model_inputs = tokenizer(sequences, padding= 'longest')# It will pad to the max sequence length
model_inputs

{'input_ids': [[101, 2023, 2038, 2042, 1037, 2204, 2154, 102], [101, 2023, 2003, 1996, 2117, 5537, 102, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0]]}

In [11]:
# Below code will pad sequence upto the max length for the model (512 for Bert)
model_inputs= tokenizer(sequences, padding= 'max_length')
model_inputs

{'input_ids': [[101, 2023, 2038, 2042, 1037, 2204, 2154, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
# Will pad the sequences up to the specified max length
model_input = tokenizer(sequences, padding = 'max_length', max_length=10)
model_input

{'input_ids': [[101, 2023, 2038, 2042, 1037, 2204, 2154, 102, 0, 0], [101, 2023, 2003, 1996, 2117, 5537, 102, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}

In [13]:
sentence = 'It has been a very good day for me'
model_inputs = tokenizer(sentence, truncation=True)# will truncate the sequence if exceeds the max length of the model
model_inputs

{'input_ids': [101, 2009, 2038, 2042, 1037, 2200, 2204, 2154, 2005, 2033, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
# we can get the model_ids as pytorch tensors or numpy array as well using tokenizer
model_inputs = tokenizer(sentence, padding=True, return_tensors = 'pt')
model_inputs # it will give pytorch tensor

{'input_ids': tensor([[ 101, 2009, 2038, 2042, 1037, 2200, 2204, 2154, 2005, 2033,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [15]:
model_inputs = tokenizer(sentence, padding = True, return_tensors = 'np')
model_inputs # it will return input_ids as numpy arrays

{'input_ids': array([[ 101, 2009, 2038, 2042, 1037, 2200, 2204, 2154, 2005, 2033,  102]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [16]:
# Special tokens in teh model
sentence = 'It has been a very good day for me'
model_inputs = tokenizer(sentence)
print('Direct input_ids: ',model_inputs['input_ids'])

Direct input_ids:  [101, 2009, 2038, 2042, 1037, 2200, 2204, 2154, 2005, 2033, 102]


In [17]:
tokens = tokenizer.tokenize(sentence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print('tokenized input_ids: ',ids)# They are a bit different from the direct input_ids above

tokenized input_ids:  [2009, 2038, 2042, 1037, 2200, 2204, 2154, 2005, 2033]


In [18]:
# now we decode both ids to see the difference
print('Sentence after decoding direct input_ids: ',tokenizer.decode(model_inputs['input_ids']))
print()
print('Sentence after decoding indirect input_ids: ',tokenizer.decode(ids)) # Direct input_ids also give the special tokens [CLS] and [SEP] of the model used

Sentence after decoding direct input_ids:  [CLS] it has been a very good day for me [SEP]

Sentence after decoding indirect input_ids:  it has been a very good day for me


In [19]:
# Now we wrap up all the processes we did above into a single process which will do padding, truncation and handle multiple sequences by itself
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ['This has been a good day', 'this is the second sequence']

In [20]:
tokens = tokenizer(sequences, padding = True, truncation=True, return_tensors = 'pt')


In [21]:
output = model(**tokens)

In [22]:
output

SequenceClassifierOutput(loss=None, logits=tensor([[-4.1322,  4.5354],
        [-2.6268,  2.6883]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)