In [4]:
from transformers import AutoTokenizer

# Manual two-step tokenization: split every sentence into tokens, then map
# each token list to its vocabulary ids. The two resulting id lists have
# different lengths because the sentences do.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
    "I've been waiting for a EmKa Academy course whole my life.",
    "I love this.",
]

tokens = list(map(tokenizer.tokenize, sentences))
ids = list(map(tokenizer.convert_tokens_to_ids, tokens))
print(ids[0])
ids[1]

[1045, 1005, 2310, 2042, 3403, 2005, 1037, 7861, 2912, 2914, 2607, 2878, 2026, 2166, 1012]


[1045, 2293, 2023, 1012]

In [5]:
import torch

# `ids` holds two lists of different lengths, so it cannot be turned into a
# rectangular tensor. The failure is the point of this cell: catch it and show
# the message, so the notebook still survives "Restart Kernel -> Run All".
try:
    input_ids = torch.tensor(ids)
except ValueError as err:
    # Prints the same "ValueError: ..." text an uncaught traceback would end with.
    print(f"{type(err).__name__}: {err}")

ValueError: expected sequence of length 15 at dim 1 (got 4)

In [8]:
import torch

# Pad the shorter sequence with trailing zeros so that both rows have the same
# length and the nested list becomes a valid 2-D tensor.
full_row = [1045, 1005, 2310, 2042, 3403, 2005, 1037, 7861, 2912, 2914, 2607, 2878, 2026, 2166, 1012]
short_row = [1045, 2293, 2023, 1012]
filler = [0] * (len(full_row) - len(short_row))
ids = [full_row, short_row + filler]

input_ids = torch.tensor(ids)
input_ids

tensor([[1045, 1005, 2310, 2042, 3403, 2005, 1037, 7861, 2912, 2914, 2607, 2878,
         2026, 2166, 1012],
        [1045, 2293, 2023, 1012,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0]])

### Padding

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer for `checkpoint` again and display the id it uses for
# padding (the cell output shows the value).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token_id

0

### All Scenarios

In [11]:
from transformers import AutoModelForSequenceClassification

# Run the model on each sentence alone and on a zero-padded batch of both,
# WITHOUT an attention mask, to show how un-masked padding affects the result.
long_ids = [1045, 1005, 2310, 2042, 3403, 2005, 1037, 7861, 2912, 2914, 2607, 2878, 2026, 2166, 1012]
short_ids = [1045, 2293, 2023, 1012]
zero_padding = [0] * (len(long_ids) - len(short_ids))

ids1 = torch.tensor([long_ids])
ids2 = torch.tensor([short_ids])

# The attention layers should ignore the padding tokens; that is why an
# attention mask has to be passed to the model. None is passed here.
all_ids = torch.tensor([long_ids, short_ids + zero_padding])

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Printed in order: ids1, ids2, then the padded batch. In the batch, the
# logits of the second row no longer match those of ids2 on its own.
for batch in (ids1, ids2, all_ids):
    print(model(batch).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 0.3817, -0.2528]], grad_fn=<AddmmBackward0>)
tensor([[-4.1351,  4.4689]], grad_fn=<AddmmBackward0>)
tensor([[ 0.3817, -0.2528],
        [-2.3545,  2.4672]], grad_fn=<AddmmBackward0>)


In [12]:
# Reference run: reload the model and score each un-padded sequence on its
# own, so the padded-batch logits can be compared against these values.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
output1, output2 = model(ids1), model(ids2)
for single_output in (output1, output2):
    print(single_output.logits)

tensor([[ 0.3817, -0.2528]], grad_fn=<AddmmBackward0>)
tensor([[-4.1351,  4.4689]], grad_fn=<AddmmBackward0>)


In [21]:
# Build the attention mask for `all_ids`: 1 for every real token, 0 for every
# padding position, so the attention layers ignore the padding.
real_token_counts = (15, 4)  # number of non-padding ids in each row of all_ids
row_width = max(real_token_counts)
attention_mask = torch.tensor(
    [[1] * count + [0] * (row_width - count) for count in real_token_counts]
)

output = model(all_ids, attention_mask=attention_mask)
# With the mask, the second row's logits match the run on ids2 alone.
print(output.logits)

tensor([[ 0.3817, -0.2528],
        [-4.1351,  4.4689]], grad_fn=<AddmmBackward0>)


### All in one

In [23]:
from transformers import AutoTokenizer

# Everything in one call: invoking the tokenizer directly on the sentences with
# padding enabled returns both the padded `input_ids` and the matching
# `attention_mask`.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
    "I've been waiting for a EmKa Academy course my whole life.",
    "I love this.",
]
encoded = tokenizer(sentences, padding=True)
print(encoded)

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 7861, 2912, 2914, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 2293, 2023, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


In [None]:
# Done.