In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Keep the selected device in a variable so later cells can reuse it (the original
# expression built the device object and immediately discarded it). The bare name on
# the last line displays it as the cell output.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
# Name of the pretrained checkpoint; the same string is passed to from_pretrained
# for both the tokenizer and the model.
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'

In [3]:
# Load the tokenizer and the sequence-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [5]:
# Example sentence for the manual tokenize -> ids -> tensor walkthrough in the next cells.
sentence = 'I am going to become one of the greatest men ever lived on planet Earth'

In [6]:
# Step 1: split the sentence into token strings (no ids yet).
tokens = tokenizer.tokenize(sentence)

In [7]:
# Step 2: map each token string to its integer id.
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [8]:
# Step 3: a tensor built from the flat id list is 1-D -- it has no batch dimension.
input_ids = torch.tensor(token_ids)

In [9]:
# Expected failure: input_ids is a 1-D tensor, but the model indexes its input as
# (batch_size, sequence_length), i.e. it expects a batch of sequences even when
# there is only one. Catch the error so the notebook still runs top to bottom,
# and print it in the usual "ErrorType: message" form.
try:
    model(input_ids)
except IndexError as err:
    print(f'{type(err).__name__}: {err}')

IndexError: too many indices for tensor of dimension 1

In [12]:
tokenized_inputs = tokenizer(sentence, return_tensors='pt')
# Calling the tokenizer directly returns tensors that already carry a leading batch
# dimension (and the extra ids 101 / 102 at both ends of the sequence), so this
# input_ids tensor is 2-D, unlike the 1-D tensor built by hand above.
print(tokenized_inputs['input_ids'])

tensor([[ 101, 1045, 2572, 2183, 2000, 2468, 2028, 1997, 1996, 4602, 2273, 2412,
         2973, 2006, 4774, 3011,  102]])


In [13]:
# (batch_size, sequence_length): a single row holding all ids of the sentence.
tokenized_inputs['input_ids'].shape

torch.Size([1, 17])

In [14]:
# NOTE(review): this repeats the checkpoint / tokenizer / model setup from the first
# cells of the notebook unchanged; re-running it just loads the same objects again.
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [16]:
# Same manual pipeline as before on a new sentence: text -> token strings -> integer ids.
# No extra ids are added at the ends here, unlike when the tokenizer is called directly.
sentence = 'I will become one of the greatest men ever lived on planet Earth'
tokens = tokenizer.tokenize(sentence)
ids = tokenizer.convert_tokens_to_ids(tokens)

In [17]:
# Wrapping ids in an extra list adds the batch dimension: the tensor is now 2-D with a
# single row, which is the layout the model accepts.
input_ids = torch.tensor([ids])

In [18]:
# Show the 2-D tensor: one row containing the ids of the sentence.
print('Input_IDs', input_ids)

Input_IDs tensor([[1045, 2097, 2468, 2028, 1997, 1996, 4602, 2273, 2412, 2973, 2006, 4774,
         3011]])


In [19]:
# Forward pass on the 2-D tensor; this works now that the batch dimension is present.
output = model(input_ids)

In [20]:
# Raw classification scores: one row per input sequence.
print('logits: ', output.logits)

logits:  tensor([[-4.0116,  4.3434]], grad_fn=<AddmmBackward0>)


In [21]:
# A batch of two identical sequences, to check that batching leaves each result unchanged.
batched_ids= [ids,ids]

In [24]:
# Both rows have the same length, so the nested list converts to a rectangular 2-D tensor.
batched_ids_tensor = torch.tensor(batched_ids)

In [26]:
# (batch_size, sequence_length): two rows, one per sequence in the batch.
batched_ids_tensor.shape

torch.Size([2, 13])

In [27]:
# Forward pass on the whole batch in a single call.
ouput_batched_ids = model(batched_ids_tensor)

In [29]:
# Both rows equal the logits of the single-sequence call above: batching identical
# sequences does not change the per-sequence result.
print('Batched_ids_logit: ', ouput_batched_ids.logits)

Batched_ids_logit:  tensor([[-4.0116,  4.3434],
        [-4.0116,  4.3434]], grad_fn=<AddmmBackward0>)


Padding and Attention Masks

In [30]:
# Two rows of different length: this nested list is ragged, so it is not yet a
# rectangular batch.
batched_ids = [[200] * 3, [200] * 2]

In [31]:
# Pad the shorter row so every row has the same length and the nested list becomes
# rectangular.
# Take the padding id from the tokenizer instead of hard-coding a number, so the value
# always agrees with the loaded checkpoint and with the later cells that use
# tokenizer.pad_token_id.
padding_id = tokenizer.pad_token_id
batched_ids = [
    [200,200,200],
    [200,200,padding_id]
]

In [33]:
# NOTE(review): loads the same checkpoint into `model` again. The earlier cells only
# call the model and never modify it, so this reload looks redundant -- confirm before removing.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [36]:
# The same two sequences, first as separate single-row batches and then combined into
# one batch in which the shorter row is filled up with the tokenizer's padding id.
sequence_ids1 = [[200] * 3]
sequence_ids2 = [[200] * 2]
batched_ids = [[200] * 3, [200] * 2 + [tokenizer.pad_token_id]]

In [37]:
print(model(torch.tensor(sequence_ids1)).logits)
print(model(torch.tensor(sequence_ids2)).logits)
# No attention_mask is passed here, so the padding id in the second row is attended to
# like a real token. That is why the second row's logits differ from the logits of
# sequence_ids2 on its own, while the first (unpadded) row matches sequence_ids1.
print(model(torch.tensor(batched_ids)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [38]:
# Attention masks: start again from the padded batch, where the last slot of the
# second row holds the tokenizer's padding id.
batched_ids = [[200] * 3, [200] * 2 + [tokenizer.pad_token_id]]

In [39]:
# Same layout as batched_ids: 1 marks a position to attend to, 0 a position to ignore.
# Only the padded slot at the end of the second row is masked out.
attention_mask = [[1] * 3, [1, 1, 0]]

In [41]:
# Forward pass on the padded batch, this time telling the model which positions to ignore.
outputs = model(
    input_ids=torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)

In [42]:
# With the attention mask applied, the second row now matches the logits of the
# unpadded two-token sequence computed earlier.
print('Output lohits: ', outputs.logits)

Output lohits:  tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [43]:
# Reminder of the sentence currently bound to `sentence`.
sentence

'I will become one of the greatest men ever lived on planet Earth'

In [44]:
# Tokenize the first sentence again so `tokens` is fresh for the comparison below.
tokens = tokenizer.tokenize(sentence)

In [46]:
# Inspect the token strings of the first sentence.
tokens

['i',
 'will',
 'become',
 'one',
 'of',
 'the',
 'greatest',
 'men',
 'ever',
 'lived',
 'on',
 'planet',
 'earth']

In [47]:
# Number of tokens in the first sentence.
len(tokens)

13

In [48]:
# A second, shorter sentence, so the two inputs have different lengths.
sentence2 = 'I am learning LLMs'

In [49]:
# Token strings of the second sentence.
tokens2 = tokenizer.tokenize(sentence2)

In [50]:
# Number of tokens in the second sentence; compare with len(tokens) for the first one.
len(tokens2)

5

In [52]:
# Convert both token lists to id lists in one pass; ids1 belongs to the first
# sentence and ids2 to the second.
ids1, ids2 = [tokenizer.convert_tokens_to_ids(toks) for toks in (tokens, tokens2)]

In [54]:
# Run each sentence through the model on its own (as a one-row batch), then report
# both results as reference values.
sentence1_logits = model(torch.tensor([ids1])).logits
sentence2_logits = model(torch.tensor([ids2])).logits
print('sentence1 logits: ', sentence1_logits)
print('sentence2 logits: ', sentence2_logits)

sentence1 logits:  tensor([[-4.0116,  4.3434]], grad_fn=<AddmmBackward0>)
sentence2 logits:  tensor([[ 1.0627, -0.9045]], grad_fn=<AddmmBackward0>)


In [59]:
# Indexing an id list with [0] yields a single token id (a plain int), not a sequence.
print(ids[0])

1045


In [73]:
# NOTE(review): ids1[0] and ids2[0] are single token ids (ints), so this is a flat list
# of two ids, not a batch of two sequences; a tensor built from it is 1-D.
# Batching the full ids1 and ids2 would need padding to a common length plus an
# attention mask, because the two sentences have different token counts (13 vs 5).
batched_ids = [ids1[0],ids2[0]]


In [74]:
# batched_ids is a flat list of two ints at this point, so the resulting tensor is 1-D.
batched_ids_tensor = torch.tensor(batched_ids)

In [75]:
# A single dimension only: this is not the (batch_size, sequence_length) layout.
batched_ids_tensor.shape

torch.Size([2])

In [76]:
# batched_ids_tensor is 1-D (its shape has a single entry), so the model rejects it in
# the same way it rejected the unbatched tensor near the top of the notebook.
# Catch the expected IndexError so the notebook still runs to the end, and print it in
# the usual "ErrorType: message" form. If the input is ever made 2-D, the logits are
# printed instead.
try:
    print('Batched Ids Logits: ', model(batched_ids_tensor).logits)
except IndexError as err:
    print(f'{type(err).__name__}: {err}')

IndexError: too many indices for tensor of dimension 1