In [3]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sentences = [
    "I've been waiting for a EmKa Academy course my whole life.",
    "I love this."
]
tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]
print(ids[0])
print(ids[1])

[1045, 1005, 2310, 2042, 3403, 2005, 1037, 7861, 2912, 2914, 2607, 2026, 2878, 2166, 1012]
[1045, 2293, 2023, 1012]


In [7]:
import tensorflow as tf

ids = [
    [1045, 1005, 2310, 2042, 3403, 2005, 1037, 7861, 2912, 2914, 2607, 2878, 2026, 2166, 1012],
    [1045, 2293, 2023, 1012]
]

input_ids = tf.constant(ids) # This happens because of different lengths. They have to be in same length to be tensored.

ValueError: Can't convert non-rectangular Python sequence to Tensor.

In [8]:
ids = [
    [1045, 1005, 2310, 2042, 3403, 2005, 1037, 7861, 2912, 2914, 2607, 2878, 2026, 2166, 1012],
    [1045, 2293, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]

input_ids = tf.constant(ids)
input_ids

<tf.Tensor: shape=(2, 15), dtype=int32, numpy=
array([[1045, 1005, 2310, 2042, 3403, 2005, 1037, 7861, 2912, 2914, 2607,
        2878, 2026, 2166, 1012],
       [1045, 2293, 2023, 1012,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]])>

### Padding

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token_id

0

### All Scenario

In [10]:
from transformers import TFAutoModelForSequenceClassification

ids1 = tf.constant([[1045, 1005, 2310, 2042, 3403, 2005, 1037, 7861, 2912, 2914, 2607, 2878, 2026, 2166, 1012]])

ids2 = tf.constant([[1045, 2293, 2023, 1012]])

all_ids = tf.constant(
    [
    [1045, 1005, 2310, 2042, 3403, 2005, 1037, 7861, 2912, 2914, 2607, 2878, 2026, 2166, 1012],
    [1045, 2293, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # attention layers to ignore the padding tokens. Thats why we need to pass them an attention mask.
]
)

model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
print(model(ids1).logits)
print(model(ids2).logits)
print(model(all_ids).logits) # ids2 logits changed.

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


tf.Tensor([[ 0.38173926 -0.2528047 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[-4.135063   4.4688683]], shape=(1, 2), dtype=float32)
tf.Tensor(
[[ 0.3817402  -0.25280547]
 [-2.3545122   2.4671905 ]], shape=(2, 2), dtype=float32)


In [11]:
attention_mask = tf.constant(
    [
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # attention layers to ignore the padding tokens. Thats why we need to pass them an attention mask.
]
)

In [12]:
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
output1 = model(ids1)
output2 = model(ids2)
print(output1.logits)
print(output2.logits)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


tf.Tensor([[ 0.38173926 -0.2528047 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[-4.135063   4.4688683]], shape=(1, 2), dtype=float32)


In [13]:
output = model(all_ids, attention_mask = attention_mask)
print(output.logits) # Same results now.

tf.Tensor(
[[ 0.3817402  -0.25280547]
 [-4.135063    4.468869  ]], shape=(2, 2), dtype=float32)


In [None]:
# Done