# Models expect a batch of inputs

In [1]:
import torch 
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [7]:
# One checkpoint name drives both the tokenizer and the classification model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Path 1: let the tokenizer do everything and hand back PyTorch tensors.
# In the recorded output these ids start with 101 and end with 102, which the
# manual path below does not produce.
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
print("Tokenized Inputs", tokenized_inputs)

# Path 2: tokenize and map to ids by hand, then add the batch dimension
# ourselves -- the model is fed a 2-D (batch, sequence) tensor.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids).unsqueeze(0)
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)


Tokenized Inputs {'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [9]:
# Repeat the same id list twice to form a batch of two identical rows.
batched_ids = [ids for _ in range(2)]
print(batched_ids)

[[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012], [1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]


Now convert `batched_ids` to a tensor and inspect the result.

In [11]:
# Both rows have the same length, so the nested list is rectangular and
# converts to a 2-D tensor.
tensor = torch.tensor(data=batched_ids)
print(tensor)

tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])


## Padding inputs
### The following list of lists cannot be converted to a tensor because its rows have different lengths
```
batched_ids = [
    [200, 200, 200],
    [200, 200]
]
```

To work around this, we use padding to make sure our tensors have a rectangular shape. Padding makes sure all our sentences have the same length by adding a special word, called the padding token, to the sentences with fewer values. For example, if you have 10 sentences with 10 words and 1 sentence with 20 words, padding will ensure all the sentences have 20 words. In our example, the resulting tensor looks like this:
<br>
```
padding_id = 100

batched_ids = [
    [200, 200, 200],
    [200, 200, padding_id],
]

```

In [16]:
# `model` and `tokenizer` are reused from the loading cell at the top of the
# notebook; calling from_pretrained on the same checkpoint again would only
# repeat an expensive load without changing the results.

# Two toy sequences of different lengths, each wrapped as a batch of one.
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

# The same two sequences as one batch: the shorter row is padded with the
# tokenizer's pad token id so that the nested list is rectangular.
batch_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id]
]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
# No attention mask is passed here, so the padded position is treated like any
# other token; in the recorded output the second row's logits differ from
# those of sequence2_ids run on its own.
print(model(torch.tensor(batch_ids)).logits)

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


## Attention masks 
Attention masks are tensors with the exact same shape as the input IDs tensor, filled with 0s and 1s: a 1 indicates that the corresponding token should be attended to, and a 0 indicates that the corresponding token should not be attended to (i.e., it should be ignored by the attention layers of the model).

In [19]:
# Padded batch: the second row is one token shorter and is filled up with the
# tokenizer's pad token id.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# Same shape as batched_ids: 1 marks a position to attend to, 0 marks the
# padded slot in the second row.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

# Convert both nested lists to tensors, then pass the mask alongside the ids.
ids_tensor = torch.tensor(batched_ids)
mask_tensor = torch.tensor(attention_mask)
outputs = model(ids_tensor, attention_mask=mask_tensor)
print(outputs.logits)


tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
